@@ -125,9 +125,36 @@ impl<'output_handler> StreamingHandlerSinkInner<'output_handler> {
125
125
}
126
126
}
127
127
128
+ /// Temporary scratch buffer that encoding_rs writes its encoded output into
128
129
enum Buffer {
130
+ /// Fixed-size buffer stored inline in the enum: avoids heap allocation, and being short it lets the encoder get back to the ASCII fast path quickly.
131
+ Stack ( [ u8 ; 63 ] ) , // 63 bytes leave one byte for the enum's tag, so that the whole enum is 64 bytes in size
132
+ /// Used when encoding_rs asks for a larger buffer, or when the content is long enough that round trips through the small buffer add up
129
133
Heap ( Vec < u8 > ) ,
130
- Stack ( [ u8 ; 63 ] ) , // leave a byte for the tag
134
+ }
135
+
136
+ impl Buffer {
137
+ /// Arbitrary content length (in bytes) at or above which the small inline buffer is replaced with a heap allocation
138
+ const CONTENT_WRITE_LENGTH_LONG_ENOUGH_TO_USE_LARGER_BUFFER : usize = 1 << 20 ;
139
+
140
+ /// Arbitrary size (in bytes) of a newly allocated heap buffer, roughly one page
141
+ const DEFAULT_HEAP_BUFFER_SIZE : usize = 4096 ;
142
+
143
+ fn buffer_for_length ( & mut self , content_len : usize ) -> & mut [ u8 ] { // returns the scratch slice to encode into, upgrading `self` to a heap buffer first when `content_len` is large
144
+ let buffer = match self {
145
+ Buffer :: Heap ( buf) => buf. as_mut_slice ( ) , // an existing heap buffer is reused regardless of `content_len`
146
+ // Long non-ASCII content could take many round trips through the encoder, so switch to the larger heap buffer
147
+ buf if content_len >= Self :: CONTENT_WRITE_LENGTH_LONG_ENOUGH_TO_USE_LARGER_BUFFER => {
148
+ * buf = Buffer :: Heap ( vec ! [ 0 ; Self :: DEFAULT_HEAP_BUFFER_SIZE ] ) ;
149
+ match buf {
150
+ Buffer :: Heap ( buf) => buf. as_mut ( ) ,
151
+ _ => unreachable ! ( ) , // `buf` was assigned the `Heap` variant just above
152
+ }
153
+ }
154
+ Buffer :: Stack ( buf) => buf. as_mut_slice ( ) , // short content keeps using the small inline buffer
155
+ } ;
156
+ buffer
157
+ }
131
158
}
132
159
133
160
struct TextEncoder {
@@ -152,6 +179,7 @@ impl TextEncoder {
152
179
#[ inline( never) ]
153
180
fn encode ( & mut self , mut content : & str , output_handler : & mut dyn FnMut ( & [ u8 ] ) ) {
154
181
loop {
182
+ // First, fast path for ASCII-only prefix
155
183
debug_assert ! ( !self . encoder. has_pending_state( ) ) ; // ASCII-compatible encodings are not supposed to have it
156
184
let ascii_len = Encoding :: ascii_valid_up_to ( content. as_bytes ( ) ) ;
157
185
if let Some ( ( ascii, remainder) ) = content. split_at_checked ( ascii_len) {
@@ -164,41 +192,34 @@ impl TextEncoder {
164
192
content = remainder;
165
193
}
166
194
167
- let buffer = match & mut self . buffer {
168
- Buffer :: Heap ( buf) => buf. as_mut_slice ( ) ,
169
- // Long non-ASCII content could take lots of roundtrips through the encoder
170
- buf if content. len ( ) >= 1 << 20 => {
171
- * buf = Buffer :: Heap ( vec ! [ 0 ; 4096 ] ) ;
172
- match buf {
173
- Buffer :: Heap ( buf) => buf. as_mut ( ) ,
174
- _ => unreachable ! ( ) ,
175
- }
176
- }
177
- Buffer :: Stack ( buf) => buf. as_mut_slice ( ) ,
178
- } ;
195
+ // Now the content starts with a non-ASCII byte, so encoding_rs may need a buffer to write the converted output into.
196
+ let buffer = self . buffer . buffer_for_length ( content. len ( ) ) ;
179
197
198
+ // last == true is needed only for the stateful ISO-2022-JP encoding, which this library doesn't allow
180
199
let ( result, read, written, _) = self . encoder . encode_from_utf8 ( content, buffer, false ) ;
200
+
181
201
if written > 0 && written <= buffer. len ( ) {
182
202
( output_handler) ( & buffer[ ..written] ) ;
183
203
}
184
204
if read >= content. len ( ) {
185
205
return ;
186
206
}
187
207
content = & content[ read..] ;
208
+
188
209
match result {
189
210
CoderResult :: InputEmpty => {
190
211
debug_assert ! ( content. is_empty( ) ) ;
191
212
return ;
192
213
}
214
+ // we've made progress, and can try again without growing the buffer
215
+ CoderResult :: OutputFull if written > 0 => { }
193
216
CoderResult :: OutputFull => {
194
- match & mut self . buffer {
195
- Buffer :: Heap ( buf) if buf. len ( ) >= 1024 => {
196
- if written == 0 {
197
- panic ! ( "encoding_rs infinite loop" ) ; // encoding_rs only needs a dozen bytes
198
- }
199
- }
200
- buf => * buf = Buffer :: Heap ( vec ! [ 0 ; 1024 ] ) ,
201
- }
217
+ // encoding_rs only needs a dozen bytes. If a large buffer is insufficient, it must be a bug.
218
+ assert ! (
219
+ buffer. len( ) < Buffer :: DEFAULT_HEAP_BUFFER_SIZE ,
220
+ "encoding_rs infinite loop"
221
+ ) ;
222
+ self . buffer = Buffer :: Heap ( vec ! [ 0 ; Buffer :: DEFAULT_HEAP_BUFFER_SIZE ] ) ;
202
223
}
203
224
}
204
225
}
@@ -213,45 +234,60 @@ const fn utf8_width(b: u8) -> u8 {
213
234
b. leading_ones ( ) as _
214
235
}
215
236
237
+ /// Stitches together UTF-8 from byte writes that may split a UTF-8 sequence across multiple fragments
216
238
struct IncompleteUtf8Resync {
217
- bytes : [ u8 ; 4 ] ,
218
- len : u8 ,
239
+ /// Holds the bytes of a UTF-8 sequence that was left incomplete at the end of a write
240
+ char_bytes : [ u8 ; 4 ] ,
241
+ /// Number of bytes currently buffered in `char_bytes` (0 when nothing is pending)
242
+ char_len : u8 ,
219
243
}
220
244
221
245
impl IncompleteUtf8Resync {
222
246
pub fn new ( ) -> Self {
223
247
Self {
224
- bytes : [ 0 ; 4 ] ,
225
- len : 0 ,
248
+ char_bytes : [ 0 ; 4 ] ,
249
+ char_len : 0 ,
226
250
}
227
251
}
228
252
253
+ /// Returns a valid UTF-8 fragment, and the not-yet-checked remainder of the bytes.
254
+ ///
255
+ /// Call `discard_incomplete()` after the last write to detect and drop any partially-written char.
229
256
pub fn utf8_bytes_to_slice < ' buf , ' src : ' buf > (
230
257
& ' buf mut self ,
231
258
mut content : & ' src [ u8 ] ,
232
259
) -> Result < ( & ' buf str , & ' src [ u8 ] ) , Utf8Error > {
233
- if self . len > 0 {
234
- let mut found_end_byte = false ;
260
+ // There may be an incomplete char buffered from the previous write, which must be continued now
261
+ if self . char_len > 0 {
262
+ let mut must_emit_now = false ;
235
263
while let Some ( ( & next_byte, rest) ) = content. split_first ( ) {
236
264
if is_continuation_byte ( next_byte) {
237
- if let Some ( buf) = self . bytes . get_mut ( self . len as usize ) {
265
+ if let Some ( buf) = self . char_bytes . get_mut ( self . char_len as usize ) {
238
266
* buf = next_byte;
239
- self . len += 1 ;
267
+ self . char_len += 1 ;
240
268
content = rest;
241
269
continue ;
242
270
}
271
+ // sequences with more continuation bytes than fit in char_bytes fall through here, and are validated when char_bytes is emitted
243
272
}
244
- found_end_byte = true ;
273
+ must_emit_now = true ;
245
274
break ;
246
275
}
247
276
248
- if found_end_byte || self . len >= utf8_width ( self . bytes [ 0 ] ) {
249
- let char_buf = self . bytes . get ( ..self . len as usize ) . ok_or ( Utf8Error ) ?;
250
- self . len = 0 ;
251
- std:: str:: from_utf8 ( char_buf)
252
- . map_err ( |_| Utf8Error )
253
- . map ( |ch| ( ch, content) )
277
+ if self . char_len >= utf8_width ( self . char_bytes [ 0 ] ) {
278
+ must_emit_now = true ;
279
+ }
280
+
281
+ if must_emit_now {
282
+ let char_buf = self
283
+ . char_bytes
284
+ . get ( ..self . char_len as usize )
285
+ . ok_or ( Utf8Error ) ?;
286
+ self . char_len = 0 ;
287
+ let ch = std:: str:: from_utf8 ( char_buf) . map_err ( |_| Utf8Error ) ?;
288
+ Ok ( ( ch, content) )
254
289
} else {
290
+ // a partial write has ended without fully completing a char (it's possible to write 1 byte at a time)
255
291
debug_assert ! ( content. is_empty( ) ) ;
256
292
Ok ( ( "" , b"" ) )
257
293
}
@@ -264,11 +300,12 @@ impl IncompleteUtf8Resync {
264
300
let ( valid, invalid) = content
265
301
. split_at_checked ( err. valid_up_to ( ) )
266
302
. ok_or ( Utf8Error ) ?;
267
- self . bytes
303
+ // save the incomplete bytes from the end for the next write
304
+ self . char_bytes
268
305
. get_mut ( ..invalid. len ( ) )
269
306
. ok_or ( Utf8Error ) ?
270
307
. copy_from_slice ( invalid) ;
271
- self . len = invalid. len ( ) as _ ;
308
+ self . char_len = invalid. len ( ) as _ ;
272
309
// valid_up_to promises it is valid
273
310
debug_assert ! ( std:: str :: from_utf8( valid) . is_ok( ) ) ;
274
311
let valid = unsafe { std:: str:: from_utf8_unchecked ( valid) } ;
@@ -280,8 +317,8 @@ impl IncompleteUtf8Resync {
280
317
281
318
/// True if there were incomplete invalid bytes in the buffer
282
319
pub fn discard_incomplete ( & mut self ) -> bool {
283
- if self . len > 0 {
284
- self . len = 0 ;
320
+ if self . char_len > 0 {
321
+ self . char_len = 0 ;
285
322
true
286
323
} else {
287
324
false
0 commit comments