@@ -94,44 +94,7 @@ pub struct LineIndex {
94
94
impl LineIndex {
95
95
/// Returns a `LineIndex` for the `text`.
96
96
pub fn new ( text : & str ) -> LineIndex {
97
- let mut newlines = Vec :: < TextSize > :: with_capacity ( 16 ) ;
98
- let mut line_wide_chars = IntMap :: < u32 , Box < [ WideChar ] > > :: default ( ) ;
99
-
100
- let mut wide_chars = Vec :: < WideChar > :: new ( ) ;
101
- let mut cur_row = TextSize :: from ( 0 ) ;
102
- let mut cur_col = TextSize :: from ( 0 ) ;
103
- let mut line = 0u32 ;
104
-
105
- for c in text. chars ( ) {
106
- let c_len = TextSize :: of ( c) ;
107
- cur_row += c_len;
108
- if c == '\n' {
109
- newlines. push ( cur_row) ;
110
-
111
- // Save any wide characters seen in the previous line
112
- if !wide_chars. is_empty ( ) {
113
- let cs = std:: mem:: take ( & mut wide_chars) . into_boxed_slice ( ) ;
114
- line_wide_chars. insert ( line, cs) ;
115
- }
116
-
117
- // Prepare for processing the next line
118
- cur_col = TextSize :: from ( 0 ) ;
119
- line += 1 ;
120
- continue ;
121
- }
122
-
123
- if !c. is_ascii ( ) {
124
- wide_chars. push ( WideChar { start : cur_col, end : cur_col + c_len } ) ;
125
- }
126
-
127
- cur_col += c_len;
128
- }
129
-
130
- // Save any wide characters seen in the last line
131
- if !wide_chars. is_empty ( ) {
132
- line_wide_chars. insert ( line, wide_chars. into_boxed_slice ( ) ) ;
133
- }
134
-
97
+ let ( newlines, line_wide_chars) = analyze_source_file ( text) ;
135
98
LineIndex {
136
99
newlines : newlines. into_boxed_slice ( ) ,
137
100
line_wide_chars,
@@ -235,3 +198,182 @@ impl LineIndex {
235
198
self . len
236
199
}
237
200
}
201
+
202
+ /// This is adapted from the rustc_span crate, https://github.com/rust-lang/rust/blob/master/compiler/rustc_span/src/analyze_source_file.rs
203
+ fn analyze_source_file ( src : & str ) -> ( Vec < TextSize > , IntMap < u32 , Box < [ WideChar ] > > ) {
204
+ assert ! ( src. len( ) < !0u32 as usize ) ;
205
+ let mut lines = vec ! [ ] ;
206
+ let mut line_wide_chars = IntMap :: < u32 , Vec < WideChar > > :: default ( ) ;
207
+
208
+ // Calls the right implementation, depending on hardware support available.
209
+ analyze_source_file_dispatch ( src, & mut lines, & mut line_wide_chars) ;
210
+
211
+ ( lines, line_wide_chars. into_iter ( ) . map ( |( k, v) | ( k, v. into_boxed_slice ( ) ) ) . collect ( ) )
212
+ }
213
+
214
+ #[ cfg( any( target_arch = "x86" , target_arch = "x86_64" ) ) ]
215
+ fn analyze_source_file_dispatch (
216
+ src : & str ,
217
+ lines : & mut Vec < TextSize > ,
218
+ multi_byte_chars : & mut IntMap < u32 , Vec < WideChar > > ,
219
+ ) {
220
+ if is_x86_feature_detected ! ( "sse2" ) {
221
+ // SAFETY: SSE2 support was checked
222
+ unsafe {
223
+ analyze_source_file_sse2 ( src, lines, multi_byte_chars) ;
224
+ }
225
+ } else {
226
+ analyze_source_file_generic ( src, src. len ( ) , TextSize :: from ( 0 ) , lines, multi_byte_chars) ;
227
+ }
228
+ }
229
+
230
+ /// Checks 16 byte chunks of text at a time. If the chunk contains
231
+ /// something other than printable ASCII characters and newlines, the
232
+ /// function falls back to the generic implementation. Otherwise it uses
233
+ /// SSE2 intrinsics to quickly find all newlines.
234
+ #[ target_feature( enable = "sse2" ) ]
235
+ #[ cfg( any( target_arch = "x86" , target_arch = "x86_64" ) ) ]
236
+ unsafe fn analyze_source_file_sse2 (
237
+ src : & str ,
238
+ lines : & mut Vec < TextSize > ,
239
+ multi_byte_chars : & mut IntMap < u32 , Vec < WideChar > > ,
240
+ ) {
241
+ #[ cfg( target_arch = "x86" ) ]
242
+ use std:: arch:: x86:: * ;
243
+ #[ cfg( target_arch = "x86_64" ) ]
244
+ use std:: arch:: x86_64:: * ;
245
+
246
+ const CHUNK_SIZE : usize = 16 ;
247
+
248
+ let src_bytes = src. as_bytes ( ) ;
249
+
250
+ let chunk_count = src. len ( ) / CHUNK_SIZE ;
251
+
252
+ // This variable keeps track of where we should start decoding a
253
+ // chunk. If a multi-byte character spans across chunk boundaries,
254
+ // we need to skip that part in the next chunk because we already
255
+ // handled it.
256
+ let mut intra_chunk_offset = 0 ;
257
+
258
+ for chunk_index in 0 ..chunk_count {
259
+ let ptr = src_bytes. as_ptr ( ) as * const __m128i ;
260
+ // We don't know if the pointer is aligned to 16 bytes, so we
261
+ // use `loadu`, which supports unaligned loading.
262
+ let chunk = _mm_loadu_si128 ( ptr. add ( chunk_index) ) ;
263
+
264
+ // For character in the chunk, see if its byte value is < 0, which
265
+ // indicates that it's part of a UTF-8 char.
266
+ let multibyte_test = _mm_cmplt_epi8 ( chunk, _mm_set1_epi8 ( 0 ) ) ;
267
+ // Create a bit mask from the comparison results.
268
+ let multibyte_mask = _mm_movemask_epi8 ( multibyte_test) ;
269
+
270
+ // If the bit mask is all zero, we only have ASCII chars here:
271
+ if multibyte_mask == 0 {
272
+ assert ! ( intra_chunk_offset == 0 ) ;
273
+
274
+ // Check for newlines in the chunk
275
+ let newlines_test = _mm_cmpeq_epi8 ( chunk, _mm_set1_epi8 ( b'\n' as i8 ) ) ;
276
+ let newlines_mask = _mm_movemask_epi8 ( newlines_test) ;
277
+
278
+ if newlines_mask != 0 {
279
+ // All control characters are newlines, record them
280
+ let mut newlines_mask = 0xFFFF0000 | newlines_mask as u32 ;
281
+ let output_offset = TextSize :: from ( ( chunk_index * CHUNK_SIZE + 1 ) as u32 ) ;
282
+
283
+ loop {
284
+ let index = newlines_mask. trailing_zeros ( ) ;
285
+
286
+ if index >= CHUNK_SIZE as u32 {
287
+ // We have arrived at the end of the chunk.
288
+ break ;
289
+ }
290
+
291
+ lines. push ( TextSize :: from ( index) + output_offset) ;
292
+
293
+ // Clear the bit, so we can find the next one.
294
+ newlines_mask &= ( !1 ) << index;
295
+ }
296
+ }
297
+ continue ;
298
+ }
299
+
300
+ // The slow path.
301
+ // There are control chars in here, fallback to generic decoding.
302
+ let scan_start = chunk_index * CHUNK_SIZE + intra_chunk_offset;
303
+ intra_chunk_offset = analyze_source_file_generic (
304
+ & src[ scan_start..] ,
305
+ CHUNK_SIZE - intra_chunk_offset,
306
+ TextSize :: from ( scan_start as u32 ) ,
307
+ lines,
308
+ multi_byte_chars,
309
+ ) ;
310
+ }
311
+
312
+ // There might still be a tail left to analyze
313
+ let tail_start = chunk_count * CHUNK_SIZE + intra_chunk_offset;
314
+ if tail_start < src. len ( ) {
315
+ analyze_source_file_generic (
316
+ & src[ tail_start..] ,
317
+ src. len ( ) - tail_start,
318
+ TextSize :: from ( tail_start as u32 ) ,
319
+ lines,
320
+ multi_byte_chars,
321
+ ) ;
322
+ }
323
+ }
324
+
325
/// Dispatcher for targets other than x86 / x86_64, where the SSE2 scanner is
/// not compiled in: the whole input goes through the byte-by-byte scanner.
///
/// Offsets just past each newline are appended to `lines`; multi-byte
/// characters are appended to `multi_byte_chars` under their line number.
#[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))]
fn analyze_source_file_dispatch(
    src: &str,
    lines: &mut Vec<TextSize>,
    multi_byte_chars: &mut IntMap<u32, Vec<WideChar>>,
) {
    let whole_input = src.len();
    let file_start = TextSize::from(0);

    // The scan covers the entire string, so there is no overflow to carry on.
    let _ = analyze_source_file_generic(src, whole_input, file_start, lines, multi_byte_chars);
}
334
+
335
+ // `scan_len` determines the number of bytes in `src` to scan. Note that the
336
+ // function can read past `scan_len` if a multi-byte character start within the
337
+ // range but extends past it. The overflow is returned by the function.
338
+ fn analyze_source_file_generic (
339
+ src : & str ,
340
+ scan_len : usize ,
341
+ output_offset : TextSize ,
342
+ lines : & mut Vec < TextSize > ,
343
+ multi_byte_chars : & mut IntMap < u32 , Vec < WideChar > > ,
344
+ ) -> usize {
345
+ assert ! ( src. len( ) >= scan_len) ;
346
+ let mut i = 0 ;
347
+ let src_bytes = src. as_bytes ( ) ;
348
+
349
+ while i < scan_len {
350
+ let byte = unsafe {
351
+ // We verified that i < scan_len <= src.len()
352
+ * src_bytes. get_unchecked ( i)
353
+ } ;
354
+
355
+ // How much to advance in order to get to the next UTF-8 char in the
356
+ // string.
357
+ let mut char_len = 1 ;
358
+
359
+ if byte == b'\n' {
360
+ lines. push ( TextSize :: from ( i as u32 + 1 ) + output_offset) ;
361
+ } else if byte >= 127 {
362
+ // The slow path: Just decode to `char`.
363
+ let c = src[ i..] . chars ( ) . next ( ) . unwrap ( ) ;
364
+ char_len = c. len_utf8 ( ) ;
365
+
366
+ let pos = TextSize :: from ( i as u32 ) + output_offset;
367
+
368
+ if char_len > 1 {
369
+ assert ! ( ( 2 ..=4 ) . contains( & char_len) ) ;
370
+ let mbc = WideChar { start : pos, end : pos + TextSize :: from ( char_len as u32 ) } ;
371
+ multi_byte_chars. entry ( lines. len ( ) as u32 ) . or_default ( ) . push ( mbc) ;
372
+ }
373
+ }
374
+
375
+ i += char_len;
376
+ }
377
+
378
+ i - scan_len
379
+ }
0 commit comments