From 237a637889ba5f38f5e12fc5e0aa3694a22017d2 Mon Sep 17 00:00:00 2001 From: Markus Reiter Date: Mon, 7 Apr 2025 01:10:59 +0200 Subject: [PATCH 1/3] Make `core::unicode::printable` more readable. --- library/core/src/unicode/printable.py | 48 +++++++-------- library/core/src/unicode/printable.rs | 84 +++++++++------------------ 2 files changed, 49 insertions(+), 83 deletions(-) diff --git a/library/core/src/unicode/printable.py b/library/core/src/unicode/printable.py index 260fa9f9e6ad2..b21ad42067f37 100755 --- a/library/core/src/unicode/printable.py +++ b/library/core/src/unicode/printable.py @@ -178,8 +178,8 @@ def main(): else: normal0.append((a, b - a)) - singletons0u, singletons0l = compress_singletons(singletons0) - singletons1u, singletons1l = compress_singletons(singletons1) + SINGLETONS0_UPPER, SINGLETONS0_LOWER = compress_singletons(singletons0) + SINGLETONS1_UPPER, SINGLETONS1_LOWER = compress_singletons(singletons1) normal0 = compress_normal(normal0) normal1 = compress_normal(normal1) @@ -187,21 +187,21 @@ def main(): // NOTE: The following code was generated by "library/core/src/unicode/printable.py", // do not edit directly! 
-fn check(x: u16, singletonuppers: &[(u8, u8)], singletonlowers: &[u8], normal: &[u8]) -> bool { - let xupper = (x >> 8) as u8; - let mut lowerstart = 0; - for &(upper, lowercount) in singletonuppers { - let lowerend = lowerstart + lowercount as usize; - if xupper == upper { - for &lower in &singletonlowers[lowerstart..lowerend] { +fn check(x: u16, singletons_upper: &[(u8, u8)], singletons_lower: &[u8], normal: &[u8]) -> bool { + let x_upper = (x >> 8) as u8; + let mut lower_start = 0; + for &(upper, lower_count) in singletons_upper { + let lower_end = lower_start + lower_count as usize; + if x_upper == upper { + for &lower in &singletons_lower[lower_start..lower_end] { if lower == x as u8 { return false; } } - } else if xupper < upper { + } else if x_upper < upper { break; } - lowerstart = lowerend; + lower_start = lower_end; } let mut x = x as i32; @@ -226,30 +226,22 @@ def main(): let x = x as u32; let lower = x as u16; - if x < 32 { - // ASCII fast path - false - } else if x < 127 { - // ASCII fast path - true - } else if x < 0x10000 { - check(lower, SINGLETONS0U, SINGLETONS0L, NORMAL0) - } else if x < 0x20000 { - check(lower, SINGLETONS1U, SINGLETONS1L, NORMAL1) - } else {\ + match x { + ..32 => false, // ASCII fast path + ..127 => true, // ASCII fast path + ..0x10000 => check(lower, SINGLETONS0_UPPER, SINGLETONS0_LOWER, NORMAL0), + ..0x20000 => check(lower, SINGLETONS1_UPPER, SINGLETONS1_LOWER, NORMAL1),\ """) for a, b in extra: - print(" if 0x{:x} <= x && x < 0x{:x} {{".format(a, a + b)) - print(" return false;") - print(" }") + print(" 0x{:x}..0x{:x} => false,".format(a, a + b)) print("""\ - true + _ => true, } }\ """) print() - print_singletons(singletons0u, singletons0l, "SINGLETONS0U", "SINGLETONS0L") - print_singletons(singletons1u, singletons1l, "SINGLETONS1U", "SINGLETONS1L") + print_singletons(SINGLETONS0_UPPER, SINGLETONS0_LOWER, "SINGLETONS0_UPPER", "SINGLETONS0_LOWER") + print_singletons(SINGLETONS1_UPPER, SINGLETONS1_LOWER, "SINGLETONS1_UPPER", 
"SINGLETONS1_LOWER") print_normal(normal0, "NORMAL0") print_normal(normal1, "NORMAL1") diff --git a/library/core/src/unicode/printable.rs b/library/core/src/unicode/printable.rs index d8fb50e4ed296..be0295a13c2fa 100644 --- a/library/core/src/unicode/printable.rs +++ b/library/core/src/unicode/printable.rs @@ -1,21 +1,21 @@ // NOTE: The following code was generated by "library/core/src/unicode/printable.py", // do not edit directly! -fn check(x: u16, singletonuppers: &[(u8, u8)], singletonlowers: &[u8], normal: &[u8]) -> bool { - let xupper = (x >> 8) as u8; - let mut lowerstart = 0; - for &(upper, lowercount) in singletonuppers { - let lowerend = lowerstart + lowercount as usize; - if xupper == upper { - for &lower in &singletonlowers[lowerstart..lowerend] { +fn check(x: u16, singletons_upper: &[(u8, u8)], singletons_lower: &[u8], normal: &[u8]) -> bool { + let x_upper = (x >> 8) as u8; + let mut lower_start = 0; + for &(upper, lower_count) in singletons_upper { + let lower_end = lower_start + lower_count as usize; + if x_upper == upper { + for &lower in &singletons_lower[lower_start..lower_end] { if lower == x as u8 { return false; } } - } else if xupper < upper { + } else if x_upper < upper { break; } - lowerstart = lowerend; + lower_start = lower_end; } let mut x = x as i32; @@ -40,53 +40,27 @@ pub(crate) fn is_printable(x: char) -> bool { let x = x as u32; let lower = x as u16; - if x < 32 { - // ASCII fast path - false - } else if x < 127 { - // ASCII fast path - true - } else if x < 0x10000 { - check(lower, SINGLETONS0U, SINGLETONS0L, NORMAL0) - } else if x < 0x20000 { - check(lower, SINGLETONS1U, SINGLETONS1L, NORMAL1) - } else { - if 0x2a6e0 <= x && x < 0x2a700 { - return false; - } - if 0x2b73a <= x && x < 0x2b740 { - return false; - } - if 0x2b81e <= x && x < 0x2b820 { - return false; - } - if 0x2cea2 <= x && x < 0x2ceb0 { - return false; - } - if 0x2ebe1 <= x && x < 0x2ebf0 { - return false; - } - if 0x2ee5e <= x && x < 0x2f800 { - return false; - } - 
if 0x2fa1e <= x && x < 0x30000 { - return false; - } - if 0x3134b <= x && x < 0x31350 { - return false; - } - if 0x323b0 <= x && x < 0xe0100 { - return false; - } - if 0xe01f0 <= x && x < 0x110000 { - return false; - } - true + match x { + ..32 => false, // ASCII fast path + ..127 => true, // ASCII fast path + ..0x10000 => check(lower, SINGLETONS0_UPPER, SINGLETONS0_LOWER, NORMAL0), + ..0x20000 => check(lower, SINGLETONS1_UPPER, SINGLETONS1_LOWER, NORMAL1), + 0x2a6e0..0x2a700 => false, + 0x2b73a..0x2b740 => false, + 0x2b81e..0x2b820 => false, + 0x2cea2..0x2ceb0 => false, + 0x2ebe1..0x2ebf0 => false, + 0x2ee5e..0x2f800 => false, + 0x2fa1e..0x30000 => false, + 0x3134b..0x31350 => false, + 0x323b0..0xe0100 => false, + 0xe01f0..0x110000 => false, + _ => true, } } #[rustfmt::skip] -const SINGLETONS0U: &[(u8, u8)] = &[ +const SINGLETONS0_UPPER: &[(u8, u8)] = &[ (0x00, 1), (0x03, 5), (0x05, 6), @@ -129,7 +103,7 @@ const SINGLETONS0U: &[(u8, u8)] = &[ (0xff, 9), ]; #[rustfmt::skip] -const SINGLETONS0L: &[u8] = &[ +const SINGLETONS0_LOWER: &[u8] = &[ 0xad, 0x78, 0x79, 0x8b, 0x8d, 0xa2, 0x30, 0x57, 0x58, 0x8b, 0x8c, 0x90, 0x1c, 0xdd, 0x0e, 0x0f, 0x4b, 0x4c, 0xfb, 0xfc, 0x2e, 0x2f, 0x3f, 0x5c, @@ -169,7 +143,7 @@ const SINGLETONS0L: &[u8] = &[ 0xfe, 0xff, ]; #[rustfmt::skip] -const SINGLETONS1U: &[(u8, u8)] = &[ +const SINGLETONS1_UPPER: &[(u8, u8)] = &[ (0x00, 6), (0x01, 1), (0x03, 1), @@ -216,7 +190,7 @@ const SINGLETONS1U: &[(u8, u8)] = &[ (0xfb, 1), ]; #[rustfmt::skip] -const SINGLETONS1L: &[u8] = &[ +const SINGLETONS1_LOWER: &[u8] = &[ 0x0c, 0x27, 0x3b, 0x3e, 0x4e, 0x4f, 0x8f, 0x9e, 0x9e, 0x9f, 0x7b, 0x8b, 0x93, 0x96, 0xa2, 0xb2, 0xba, 0x86, 0xb1, 0x06, 0x07, 0x09, 0x36, 0x3d, From 32034e8e7e391c620e241c9a6b5be79d8ca7d14e Mon Sep 17 00:00:00 2001 From: Markus Reiter Date: Mon, 7 Apr 2025 01:16:30 +0200 Subject: [PATCH 2/3] Optimize `core::unicode::printable`. 
--- library/core/src/unicode/printable.py | 75 +++++++++++++++++++++++---- library/core/src/unicode/printable.rs | 67 ++++++++++++++++++++---- 2 files changed, 122 insertions(+), 20 deletions(-) diff --git a/library/core/src/unicode/printable.py b/library/core/src/unicode/printable.py index b21ad42067f37..8ea82ded9b36f 100755 --- a/library/core/src/unicode/printable.py +++ b/library/core/src/unicode/printable.py @@ -187,14 +187,28 @@ def main(): // NOTE: The following code was generated by "library/core/src/unicode/printable.py", // do not edit directly! -fn check(x: u16, singletons_upper: &[(u8, u8)], singletons_lower: &[u8], normal: &[u8]) -> bool { - let x_upper = (x >> 8) as u8; +/// # Safety +/// +/// - The sum of all lengths (i.e. the second field of each pair) in `singletons_upper` must be +/// equal to the length of `singletons_lower`. +/// - `normal` must be encoded such that lengths greater than `0x7f` consist of two bytes in big +/// endian, with the highest bit set and the length contained in the remaining 15 bits. +unsafe fn check( + x: u16, + singletons_upper: &[(u8, u8)], + singletons_lower: &[u8], + normal: &[u8], +) -> bool { + let [x_upper, x_lower] = x.to_be_bytes(); let mut lower_start = 0; for &(upper, lower_count) in singletons_upper { let lower_end = lower_start + lower_count as usize; - if x_upper == upper { - for &lower in &singletons_lower[lower_start..lower_end] { - if lower == x as u8 { + if upper == x_upper { + // SAFETY: The caller ensures that the sum of all lengths in `singletons_upper` + // is equal to the length of `singletons_lower`, so `lower_end` is guaranteed to be + // less than `singletons_lower.len()`. 
+ for &lower in unsafe { singletons_lower.get_unchecked(lower_start..lower_end) } { + if lower == x_lower { return false; } } @@ -209,9 +223,14 @@ def main(): let mut current = true; while let Some(v) = normal.next() { let len = if v & 0x80 != 0 { - ((v & 0x7f) as i32) << 8 | normal.next().unwrap() as i32 + let upper = v & 0x7f; + // SAFETY: The encoding of `normal` is guaranteed by the caller such that + // if the length is greater than 0x7f, it consists of two bytes, so there + // must be a next byte. + let lower = unsafe { normal.next().unwrap_unchecked() }; + i32::from(u16::from_be_bytes([upper, lower])) } else { - v as i32 + i32::from(v) }; x -= len; if x < 0 { @@ -229,8 +248,38 @@ def main(): match x { ..32 => false, // ASCII fast path ..127 => true, // ASCII fast path - ..0x10000 => check(lower, SINGLETONS0_UPPER, SINGLETONS0_LOWER, NORMAL0), - ..0x20000 => check(lower, SINGLETONS1_UPPER, SINGLETONS1_LOWER, NORMAL1),\ + ..0x10000 => { + const { + let mut lower_count_total = 0; + let mut i = 0; + while i < SINGLETONS0_UPPER.len() { + lower_count_total += SINGLETONS0_UPPER[i].1 as usize; + i += 1; + } + assert!(lower_count_total == SINGLETONS0_LOWER.len()); + } + // SAFETY: We just asserted that the sum of all lengths in `SINGLETONS0_UPPER` is equal + // to the length of `SINGLETONS0_LOWER`, and `NORMAL0` is encoded such that lengths + // greater than `0x7f` consist of two bytes in big endian, with the highest bit set and + // the length contained in the remaining 15 bits. 
+ unsafe { check(lower, SINGLETONS0_UPPER, SINGLETONS0_LOWER, NORMAL0) } + } + ..0x20000 => { + const { + let mut lower_count_total = 0; + let mut i = 0; + while i < SINGLETONS1_UPPER.len() { + lower_count_total += SINGLETONS1_UPPER[i].1 as usize; + i += 1; + } + assert!(lower_count_total == SINGLETONS1_LOWER.len()); + } + // SAFETY: We just asserted that the sum of all lengths in `SINGLETONS1_UPPER` is equal + // to the length of `SINGLETONS1_LOWER`, and `NORMAL1` is encoded such that lengths + // greater than `0x7f` consist of two bytes in big endian, with the highest bit set and + // the length contained in the remaining 15 bits. + unsafe { check(lower, SINGLETONS1_UPPER, SINGLETONS1_LOWER, NORMAL1) } + }\ """) for a, b in extra: print(" 0x{:x}..0x{:x} => false,".format(a, a + b)) @@ -240,8 +289,12 @@ def main(): }\ """) print() - print_singletons(SINGLETONS0_UPPER, SINGLETONS0_LOWER, "SINGLETONS0_UPPER", "SINGLETONS0_LOWER") - print_singletons(SINGLETONS1_UPPER, SINGLETONS1_LOWER, "SINGLETONS1_UPPER", "SINGLETONS1_LOWER") + print_singletons( + SINGLETONS0_UPPER, SINGLETONS0_LOWER, "SINGLETONS0_UPPER", "SINGLETONS0_LOWER" + ) + print_singletons( + SINGLETONS1_UPPER, SINGLETONS1_LOWER, "SINGLETONS1_UPPER", "SINGLETONS1_LOWER" + ) print_normal(normal0, "NORMAL0") print_normal(normal1, "NORMAL1") diff --git a/library/core/src/unicode/printable.rs b/library/core/src/unicode/printable.rs index be0295a13c2fa..8cd891670cf0a 100644 --- a/library/core/src/unicode/printable.rs +++ b/library/core/src/unicode/printable.rs @@ -1,14 +1,28 @@ // NOTE: The following code was generated by "library/core/src/unicode/printable.py", // do not edit directly! -fn check(x: u16, singletons_upper: &[(u8, u8)], singletons_lower: &[u8], normal: &[u8]) -> bool { - let x_upper = (x >> 8) as u8; +/// # Safety +/// +/// - The sum of all lengths (i.e. the second field of each pair) in `singletons_upper` must be +/// equal to the length of `singletons_lower`. 
+/// - `normal` must be encoded such that lengths greater than `0x7f` consist of two bytes in big +/// endian, with the highest bit set and the length contained in the remaining 15 bits. +unsafe fn check( + x: u16, + singletons_upper: &[(u8, u8)], + singletons_lower: &[u8], + normal: &[u8], +) -> bool { + let [x_upper, x_lower] = x.to_be_bytes(); let mut lower_start = 0; for &(upper, lower_count) in singletons_upper { let lower_end = lower_start + lower_count as usize; - if x_upper == upper { - for &lower in &singletons_lower[lower_start..lower_end] { - if lower == x as u8 { + if upper == x_upper { + // SAFETY: The caller ensures that the sum of all lengths in `singletons_upper` + // is equal to the length of `singletons_lower`, so `lower_end` is guaranteed to be + // less than `singletons_lower.len()`. + for &lower in unsafe { singletons_lower.get_unchecked(lower_start..lower_end) } { + if lower == x_lower { return false; } } @@ -23,9 +37,14 @@ fn check(x: u16, singletons_upper: &[(u8, u8)], singletons_lower: &[u8], normal: let mut current = true; while let Some(v) = normal.next() { let len = if v & 0x80 != 0 { - ((v & 0x7f) as i32) << 8 | normal.next().unwrap() as i32 + let upper = v & 0x7f; + // SAFETY: The encoding of `normal` is guaranteed by the caller such that + // if the length is greater than 0x7f, it consists of two bytes, so there + // must be a next byte. 
+ let lower = unsafe { normal.next().unwrap_unchecked() }; + i32::from(u16::from_be_bytes([upper, lower])) } else { - v as i32 + i32::from(v) }; x -= len; if x < 0 { @@ -43,8 +62,38 @@ pub(crate) fn is_printable(x: char) -> bool { match x { ..32 => false, // ASCII fast path ..127 => true, // ASCII fast path - ..0x10000 => check(lower, SINGLETONS0_UPPER, SINGLETONS0_LOWER, NORMAL0), - ..0x20000 => check(lower, SINGLETONS1_UPPER, SINGLETONS1_LOWER, NORMAL1), + ..0x10000 => { + const { + let mut lower_count_total = 0; + let mut i = 0; + while i < SINGLETONS0_UPPER.len() { + lower_count_total += SINGLETONS0_UPPER[i].1 as usize; + i += 1; + } + assert!(lower_count_total == SINGLETONS0_LOWER.len()); + } + // SAFETY: We just asserted that the sum of all lengths in `SINGLETONS0_UPPER` is equal + // to the length of `SINGLETONS0_LOWER`, and `NORMAL0` is encoded such that lengths + // greater than `0x7f` consist of two bytes in big endian, with the highest bit set and + // the length contained in the remaining 15 bits. + unsafe { check(lower, SINGLETONS0_UPPER, SINGLETONS0_LOWER, NORMAL0) } + } + ..0x20000 => { + const { + let mut lower_count_total = 0; + let mut i = 0; + while i < SINGLETONS1_UPPER.len() { + lower_count_total += SINGLETONS1_UPPER[i].1 as usize; + i += 1; + } + assert!(lower_count_total == SINGLETONS1_LOWER.len()); + } + // SAFETY: We just asserted that the sum of all lengths in `SINGLETONS1_UPPER` is equal + // to the length of `SINGLETONS1_LOWER`, and `NORMAL1` is encoded such that lengths + // greater than `0x7f` consist of two bytes in big endian, with the highest bit set and + // the length contained in the remaining 15 bits. 
+ unsafe { check(lower, SINGLETONS1_UPPER, SINGLETONS1_LOWER, NORMAL1) } + } 0x2a6e0..0x2a700 => false, 0x2b73a..0x2b740 => false, 0x2b81e..0x2b820 => false, From 66ada05d5410260f8b0c468d4887f072428117e4 Mon Sep 17 00:00:00 2001 From: Markus Reiter Date: Fri, 30 May 2025 02:22:48 +0200 Subject: [PATCH 3/3] Reduce amount of generated `unsafe` code. --- library/core/src/unicode/printable.py | 127 ++++------------- library/core/src/unicode/printable.rs | 103 ++------------ library/core/src/unicode/printable/check.rs | 147 ++++++++++++++++++++ 3 files changed, 187 insertions(+), 190 deletions(-) create mode 100644 library/core/src/unicode/printable/check.rs diff --git a/library/core/src/unicode/printable.py b/library/core/src/unicode/printable.py index 8ea82ded9b36f..d9d811cd95fee 100755 --- a/library/core/src/unicode/printable.py +++ b/library/core/src/unicode/printable.py @@ -92,8 +92,8 @@ def compress_singletons(singletons): def compress_normal(normal): - # lengths 0x00..0x7f are encoded as 00, 01, ..., 7e, 7f - # lengths 0x80..0x7fff are encoded as 80 80, 80 81, ..., ff fe, ff ff + # lengths 0x00..0x7f are encoded as 0x00, 0x01, …, 0x7e, 0x7f + # lengths 0x80..0x7fff are encoded as 0x8080, 0x8081, …, 0xfffe, 0xffff compressed = [] # [truelen, (truelenaux), falselen, (falselenaux)] prev_start = 0 @@ -108,19 +108,19 @@ def compress_normal(normal): entry.append(0x80 | (truelen >> 8)) entry.append(truelen & 0xFF) else: - entry.append(truelen & 0x7F) + entry.append(truelen) if falselen > 0x7F: entry.append(0x80 | (falselen >> 8)) entry.append(falselen & 0xFF) else: - entry.append(falselen & 0x7F) + entry.append(falselen) compressed.append(entry) return compressed -def print_singletons(uppers, lowers, uppersname, lowersname): +def print_singletons(name, uppers, lowers, uppersname, lowersname): print("#[rustfmt::skip]") print("const {}: &[(u8, u8)] = &[".format(uppersname)) for u, c in uppers: @@ -134,13 +134,15 @@ def print_singletons(uppers, lowers, uppersname, 
lowersname): ) print("];") + print(f"const {name}: Singletons = Singletons::new(&{uppersname}, &{lowersname});") + def print_normal(normal, normalname): print("#[rustfmt::skip]") - print("const {}: &[u8] = &[".format(normalname)) + print("const {}: Normal = Normal::new(&[".format(normalname)) for v in normal: print(" {}".format(" ".join("{:#04x},".format(i) for i in v))) - print("];") + print("]);") def main(): @@ -178,8 +180,8 @@ def main(): else: normal0.append((a, b - a)) - SINGLETONS0_UPPER, SINGLETONS0_LOWER = compress_singletons(singletons0) - SINGLETONS1_UPPER, SINGLETONS1_LOWER = compress_singletons(singletons1) + singletons0_upper, singletons0_lower = compress_singletons(singletons0) + singletons1_upper, singletons1_lower = compress_singletons(singletons1) normal0 = compress_normal(normal0) normal1 = compress_normal(normal1) @@ -187,102 +189,21 @@ def main(): // NOTE: The following code was generated by "library/core/src/unicode/printable.py", // do not edit directly! -/// # Safety -/// -/// - The sum of all lengths (i.e. the second field of each pair) in `singletons_upper` must be -/// equal to the length of `singletons_lower`. -/// - `normal` must be encoded such that lengths greater than `0x7f` consist of two bytes in big -/// endian, with the highest bit set and the length contained in the remaining 15 bits. -unsafe fn check( - x: u16, - singletons_upper: &[(u8, u8)], - singletons_lower: &[u8], - normal: &[u8], -) -> bool { - let [x_upper, x_lower] = x.to_be_bytes(); - let mut lower_start = 0; - for &(upper, lower_count) in singletons_upper { - let lower_end = lower_start + lower_count as usize; - if upper == x_upper { - // SAFETY: The caller ensures that the sum of all lengths in `singletons_upper` - // is equal to the length of `singletons_lower`, so `lower_end` is guaranteed to be - // less than `singletons_lower.len()`. 
- for &lower in unsafe { singletons_lower.get_unchecked(lower_start..lower_end) } { - if lower == x_lower { - return false; - } - } - } else if x_upper < upper { - break; - } - lower_start = lower_end; - } - - let mut x = x as i32; - let mut normal = normal.iter().cloned(); - let mut current = true; - while let Some(v) = normal.next() { - let len = if v & 0x80 != 0 { - let upper = v & 0x7f; - // SAFETY: The encoding of `normal` is guaranteed by the caller such that - // if the length is greater than 0x7f, it consists of two bytes, so there - // must be a next byte. - let lower = unsafe { normal.next().unwrap_unchecked() }; - i32::from(u16::from_be_bytes([upper, lower])) - } else { - i32::from(v) - }; - x -= len; - if x < 0 { - break; - } - current = !current; - } - current -} +mod check; +use check::{Normal, Singletons}; pub(crate) fn is_printable(x: char) -> bool { let x = x as u32; let lower = x as u16; match x { - ..32 => false, // ASCII fast path - ..127 => true, // ASCII fast path - ..0x10000 => { - const { - let mut lower_count_total = 0; - let mut i = 0; - while i < SINGLETONS0_UPPER.len() { - lower_count_total += SINGLETONS0_UPPER[i].1 as usize; - i += 1; - } - assert!(lower_count_total == SINGLETONS0_LOWER.len()); - } - // SAFETY: We just asserted that the sum of all lengths in `SINGLETONS0_UPPER` is equal - // to the length of `SINGLETONS0_LOWER`, and `NORMAL0` is encoded such that lengths - // greater than `0x7f` consist of two bytes in big endian, with the highest bit set and - // the length contained in the remaining 15 bits. 
- unsafe { check(lower, SINGLETONS0_UPPER, SINGLETONS0_LOWER, NORMAL0) } - } - ..0x20000 => { - const { - let mut lower_count_total = 0; - let mut i = 0; - while i < SINGLETONS1_UPPER.len() { - lower_count_total += SINGLETONS1_UPPER[i].1 as usize; - i += 1; - } - assert!(lower_count_total == SINGLETONS1_LOWER.len()); - } - // SAFETY: We just asserted that the sum of all lengths in `SINGLETONS1_UPPER` is equal - // to the length of `SINGLETONS1_LOWER`, and `NORMAL1` is encoded such that lengths - // greater than `0x7f` consist of two bytes in big endian, with the highest bit set and - // the length contained in the remaining 15 bits. - unsafe { check(lower, SINGLETONS1_UPPER, SINGLETONS1_LOWER, NORMAL1) } - }\ + ..32 => false, // ASCII fast path. + ..127 => true, // ASCII fast path. + ..0x10000 => SINGLETONS0.check(lower) && NORMAL0.check(lower), + ..0x20000 => SINGLETONS1.check(lower) && NORMAL1.check(lower),\ """) for a, b in extra: - print(" 0x{:x}..0x{:x} => false,".format(a, a + b)) + print(" {:#x}..{:#x} => false,".format(a, a + b)) print("""\ _ => true, } @@ -290,10 +211,18 @@ def main(): """) print() print_singletons( - SINGLETONS0_UPPER, SINGLETONS0_LOWER, "SINGLETONS0_UPPER", "SINGLETONS0_LOWER" + "SINGLETONS0", + singletons0_upper, + singletons0_lower, + "SINGLETONS0_UPPER", + "SINGLETONS0_LOWER", ) print_singletons( - SINGLETONS1_UPPER, SINGLETONS1_LOWER, "SINGLETONS1_UPPER", "SINGLETONS1_LOWER" + "SINGLETONS1", + singletons1_upper, + singletons1_lower, + "SINGLETONS1_UPPER", + "SINGLETONS1_LOWER", ) print_normal(normal0, "NORMAL0") print_normal(normal1, "NORMAL1") diff --git a/library/core/src/unicode/printable.rs b/library/core/src/unicode/printable.rs index 8cd891670cf0a..878ecbc436384 100644 --- a/library/core/src/unicode/printable.rs +++ b/library/core/src/unicode/printable.rs @@ -1,99 +1,18 @@ // NOTE: The following code was generated by "library/core/src/unicode/printable.py", // do not edit directly! 
-/// # Safety -/// -/// - The sum of all lengths (i.e. the second field of each pair) in `singletons_upper` must be -/// equal to the length of `singletons_lower`. -/// - `normal` must be encoded such that lengths greater than `0x7f` consist of two bytes in big -/// endian, with the highest bit set and the length contained in the remaining 15 bits. -unsafe fn check( - x: u16, - singletons_upper: &[(u8, u8)], - singletons_lower: &[u8], - normal: &[u8], -) -> bool { - let [x_upper, x_lower] = x.to_be_bytes(); - let mut lower_start = 0; - for &(upper, lower_count) in singletons_upper { - let lower_end = lower_start + lower_count as usize; - if upper == x_upper { - // SAFETY: The caller ensures that the sum of all lengths in `singletons_upper` - // is equal to the length of `singletons_lower`, so `lower_end` is guaranteed to be - // less than `singletons_lower.len()`. - for &lower in unsafe { singletons_lower.get_unchecked(lower_start..lower_end) } { - if lower == x_lower { - return false; - } - } - } else if x_upper < upper { - break; - } - lower_start = lower_end; - } - - let mut x = x as i32; - let mut normal = normal.iter().cloned(); - let mut current = true; - while let Some(v) = normal.next() { - let len = if v & 0x80 != 0 { - let upper = v & 0x7f; - // SAFETY: The encoding of `normal` is guaranteed by the caller such that - // if the length is greater than 0x7f, it consists of two bytes, so there - // must be a next byte. 
- let lower = unsafe { normal.next().unwrap_unchecked() }; - i32::from(u16::from_be_bytes([upper, lower])) - } else { - i32::from(v) - }; - x -= len; - if x < 0 { - break; - } - current = !current; - } - current -} +mod check; +use check::{Normal, Singletons}; pub(crate) fn is_printable(x: char) -> bool { let x = x as u32; let lower = x as u16; match x { - ..32 => false, // ASCII fast path - ..127 => true, // ASCII fast path - ..0x10000 => { - const { - let mut lower_count_total = 0; - let mut i = 0; - while i < SINGLETONS0_UPPER.len() { - lower_count_total += SINGLETONS0_UPPER[i].1 as usize; - i += 1; - } - assert!(lower_count_total == SINGLETONS0_LOWER.len()); - } - // SAFETY: We just asserted that the sum of all lengths in `SINGLETONS0_UPPER` is equal - // to the length of `SINGLETONS0_LOWER`, and `NORMAL0` is encoded such that lengths - // greater than `0x7f` consist of two bytes in big endian, with the highest bit set and - // the length contained in the remaining 15 bits. - unsafe { check(lower, SINGLETONS0_UPPER, SINGLETONS0_LOWER, NORMAL0) } - } - ..0x20000 => { - const { - let mut lower_count_total = 0; - let mut i = 0; - while i < SINGLETONS1_UPPER.len() { - lower_count_total += SINGLETONS1_UPPER[i].1 as usize; - i += 1; - } - assert!(lower_count_total == SINGLETONS1_LOWER.len()); - } - // SAFETY: We just asserted that the sum of all lengths in `SINGLETONS1_UPPER` is equal - // to the length of `SINGLETONS1_LOWER`, and `NORMAL1` is encoded such that lengths - // greater than `0x7f` consist of two bytes in big endian, with the highest bit set and - // the length contained in the remaining 15 bits. - unsafe { check(lower, SINGLETONS1_UPPER, SINGLETONS1_LOWER, NORMAL1) } - } + ..32 => false, // ASCII fast path. + ..127 => true, // ASCII fast path. 
+ ..0x10000 => SINGLETONS0.check(lower) && NORMAL0.check(lower), + ..0x20000 => SINGLETONS1.check(lower) && NORMAL1.check(lower), 0x2a6e0..0x2a700 => false, 0x2b73a..0x2b740 => false, 0x2b81e..0x2b820 => false, @@ -191,6 +110,7 @@ const SINGLETONS0_LOWER: &[u8] = &[ 0x75, 0xc8, 0xc9, 0xd0, 0xd1, 0xd8, 0xd9, 0xe7, 0xfe, 0xff, ]; +const SINGLETONS0: Singletons = Singletons::new(&SINGLETONS0_UPPER, &SINGLETONS0_LOWER); #[rustfmt::skip] const SINGLETONS1_UPPER: &[(u8, u8)] = &[ (0x00, 6), @@ -267,8 +187,9 @@ const SINGLETONS1_LOWER: &[u8] = &[ 0x7d, 0x7f, 0x8a, 0xa4, 0xaa, 0xaf, 0xb0, 0xc0, 0xd0, 0xae, 0xaf, 0x6e, 0x6f, 0xdd, 0xde, 0x93, ]; +const SINGLETONS1: Singletons = Singletons::new(&SINGLETONS1_UPPER, &SINGLETONS1_LOWER); #[rustfmt::skip] -const NORMAL0: &[u8] = &[ +const NORMAL0: Normal = Normal::new(&[ 0x00, 0x20, 0x5f, 0x22, 0x82, 0xdf, 0x04, @@ -402,9 +323,9 @@ const NORMAL0: &[u8] = &[ 0x80, 0xbe, 0x03, 0x1b, 0x03, 0x0f, 0x0d, -]; +]); #[rustfmt::skip] -const NORMAL1: &[u8] = &[ +const NORMAL1: Normal = Normal::new(&[ 0x5e, 0x22, 0x7b, 0x05, 0x03, 0x04, @@ -624,4 +545,4 @@ const NORMAL1: &[u8] = &[ 0x1c, 0x06, 0x09, 0x07, 0x80, 0xfa, 0x84, 0x06, -]; +]); diff --git a/library/core/src/unicode/printable/check.rs b/library/core/src/unicode/printable/check.rs new file mode 100644 index 0000000000000..d62a57950448e --- /dev/null +++ b/library/core/src/unicode/printable/check.rs @@ -0,0 +1,147 @@ +use crate::{iter, slice}; + +/// A compact representation of Unicode singletons. +/// +/// This is basically a `&[u16]`, but represented as `&[(u8, &[u8])]`, +/// i.e. pairs of upper bytes and multiple corresponding lower bytes. +/// +/// However, in order to reduce the pointer-sized overhead for each nested +/// slice, it is compacted again into `&[(u8, u8)]` with the length of the +/// lower bytes in the second byte, and a separate, contiguous `&[u8]` for +/// storing the lower bytes. 
+pub(super) struct Singletons { + upper: &'static [(u8, u8)], + lower: &'static [u8], +} + +impl Singletons { + /// Creates a new `Singletons` instance from compacted upper and lower bytes. + /// + /// # Panics + /// + /// Panics if the sum of all lengths (i.e. the second field of each pair) in `upper` + /// is not equal to the length of `lower`. + pub(super) const fn new(upper: &'static [(u8, u8)], lower: &'static [u8]) -> Self { + let mut lower_count_total = 0; + let mut i = 0; + while i < upper.len() { + lower_count_total += upper[i].1 as usize; + i += 1; + } + assert!( + lower_count_total == lower.len(), + "Sum of lengths in `upper` does not match `lower` length." + ); + + Self { upper, lower } + } + + #[inline] + fn iter(&self) -> SingletonsIter { + SingletonsIter { iter: self.upper.iter().cloned(), lower: self.lower, lower_start: 0 } + } + + pub(super) fn check(&self, x: u16) -> bool { + let [x_upper, x_lower] = x.to_be_bytes(); + for (upper, lowers) in self.iter() { + if upper == x_upper { + for &lower in lowers { + if lower == x_lower { + return false; + } + } + } else if x_upper < upper { + break; + } + } + + true + } +} + +struct SingletonsIter { + iter: iter::Cloned<slice::Iter<'static, (u8, u8)>>, + lower: &'static [u8], + lower_start: usize, +} + +impl Iterator for SingletonsIter { + type Item = (u8, &'static [u8]); + + fn next(&mut self) -> Option<Self::Item> { + let (upper, lower_count) = self.iter.next()?; + + let lower_start = self.lower_start; + let lower_end = lower_start + lower_count as usize; + self.lower_start = lower_end; + + // SAFETY: The invariant for `Singletons` guarantees that the sum of all lengths + // in `upper` must be equal to the length of `lower`, so `lower_end` is guaranteed + // to be in range. + let lowers = unsafe { self.lower.get_unchecked(lower_start..lower_end) }; + + Some((upper, lowers)) + } +} + +/// A compact representation of lengths. 
+pub(super) struct Normal(&'static [u8]); + +impl Normal { + pub(super) const fn new(normal: &'static [u8]) -> Self { + // Invariant: Lengths greater than `0x7f` must be encoded as two bytes, + // with the length contained in the remaining 15 bits, i.e. `0x7fff`. + { + let mut i = 0; + while i < normal.len() { + if normal[i] & 0b1000_0000 != 0 { + assert!( + i + 1 < normal.len(), + "Length greater than `0x7f` is not encoded as two bytes." + ); + i += 2; + } else { + i += 1; + } + } + } + + Self(normal) + } + + #[inline] + fn iter(&self) -> NormalIter { + NormalIter { iter: self.0.iter().cloned() } + } + + pub(super) fn check(&self, mut x: u16) -> bool { + let mut current = true; + for len in self.iter() { + x = if let Some(x) = x.checked_sub(len) { x } else { break }; + current = !current; + } + current + } +} + +struct NormalIter { + iter: iter::Cloned<slice::Iter<'static, u8>>, +} + +impl Iterator for NormalIter { + type Item = u16; + + fn next(&mut self) -> Option<Self::Item> { + let len = self.iter.next()?; + + Some(if len & 0b1000_0000 != 0 { + let upper = len & 0b0111_1111; + // SAFETY: The invariant of `Normal` guarantees that lengths are encoded + // as two bytes if greater than `0x7f`, so there must be a next byte. + let lower = unsafe { self.iter.next().unwrap_unchecked() }; + u16::from_be_bytes([upper, lower]) + } else { + u16::from(len) + }) + } +}