diff --git a/library/core/src/unicode/printable.py b/library/core/src/unicode/printable.py index 260fa9f9e6ad2..d9d811cd95fee 100755 --- a/library/core/src/unicode/printable.py +++ b/library/core/src/unicode/printable.py @@ -92,8 +92,8 @@ def compress_singletons(singletons): def compress_normal(normal): - # lengths 0x00..0x7f are encoded as 00, 01, ..., 7e, 7f - # lengths 0x80..0x7fff are encoded as 80 80, 80 81, ..., ff fe, ff ff + # lengths 0x00..0x7f are encoded as 0x00, 0x01, …, 0x7e, 0x7f + # lengths 0x80..0x7fff are encoded as 0x8080, 0x8081, …, 0xfffe, 0xffff compressed = [] # [truelen, (truelenaux), falselen, (falselenaux)] prev_start = 0 @@ -108,19 +108,19 @@ def compress_normal(normal): entry.append(0x80 | (truelen >> 8)) entry.append(truelen & 0xFF) else: - entry.append(truelen & 0x7F) + entry.append(truelen) if falselen > 0x7F: entry.append(0x80 | (falselen >> 8)) entry.append(falselen & 0xFF) else: - entry.append(falselen & 0x7F) + entry.append(falselen) compressed.append(entry) return compressed -def print_singletons(uppers, lowers, uppersname, lowersname): +def print_singletons(name, uppers, lowers, uppersname, lowersname): print("#[rustfmt::skip]") print("const {}: &[(u8, u8)] = &[".format(uppersname)) for u, c in uppers: @@ -134,13 +134,15 @@ def print_singletons(uppers, lowers, uppersname, lowersname): ) print("];") + print(f"const {name}: Singletons = Singletons::new(&{uppersname}, &{lowersname});") + def print_normal(normal, normalname): print("#[rustfmt::skip]") - print("const {}: &[u8] = &[".format(normalname)) + print("const {}: Normal = Normal::new(&[".format(normalname)) for v in normal: print(" {}".format(" ".join("{:#04x},".format(i) for i in v))) - print("];") + print("]);") def main(): @@ -178,8 +180,8 @@ def main(): else: normal0.append((a, b - a)) - singletons0u, singletons0l = compress_singletons(singletons0) - singletons1u, singletons1l = compress_singletons(singletons1) + singletons0_upper, singletons0_lower = 
compress_singletons(singletons0) + singletons1_upper, singletons1_lower = compress_singletons(singletons1) normal0 = compress_normal(normal0) normal1 = compress_normal(normal1) @@ -187,69 +189,41 @@ def main(): // NOTE: The following code was generated by "library/core/src/unicode/printable.py", // do not edit directly! -fn check(x: u16, singletonuppers: &[(u8, u8)], singletonlowers: &[u8], normal: &[u8]) -> bool { - let xupper = (x >> 8) as u8; - let mut lowerstart = 0; - for &(upper, lowercount) in singletonuppers { - let lowerend = lowerstart + lowercount as usize; - if xupper == upper { - for &lower in &singletonlowers[lowerstart..lowerend] { - if lower == x as u8 { - return false; - } - } - } else if xupper < upper { - break; - } - lowerstart = lowerend; - } - - let mut x = x as i32; - let mut normal = normal.iter().cloned(); - let mut current = true; - while let Some(v) = normal.next() { - let len = if v & 0x80 != 0 { - ((v & 0x7f) as i32) << 8 | normal.next().unwrap() as i32 - } else { - v as i32 - }; - x -= len; - if x < 0 { - break; - } - current = !current; - } - current -} +mod check; +use check::{Normal, Singletons}; pub(crate) fn is_printable(x: char) -> bool { let x = x as u32; let lower = x as u16; - if x < 32 { - // ASCII fast path - false - } else if x < 127 { - // ASCII fast path - true - } else if x < 0x10000 { - check(lower, SINGLETONS0U, SINGLETONS0L, NORMAL0) - } else if x < 0x20000 { - check(lower, SINGLETONS1U, SINGLETONS1L, NORMAL1) - } else {\ + match x { + ..32 => false, // ASCII fast path. + ..127 => true, // ASCII fast path. 
+ ..0x10000 => SINGLETONS0.check(lower) && NORMAL0.check(lower), + ..0x20000 => SINGLETONS1.check(lower) && NORMAL1.check(lower),\ """) for a, b in extra: - print(" if 0x{:x} <= x && x < 0x{:x} {{".format(a, a + b)) - print(" return false;") - print(" }") + print(" {:#x}..{:#x} => false,".format(a, a + b)) print("""\ - true + _ => true, } }\ """) print() - print_singletons(singletons0u, singletons0l, "SINGLETONS0U", "SINGLETONS0L") - print_singletons(singletons1u, singletons1l, "SINGLETONS1U", "SINGLETONS1L") + print_singletons( + "SINGLETONS0", + singletons0_upper, + singletons0_lower, + "SINGLETONS0_UPPER", + "SINGLETONS0_LOWER", + ) + print_singletons( + "SINGLETONS1", + singletons1_upper, + singletons1_lower, + "SINGLETONS1_UPPER", + "SINGLETONS1_LOWER", + ) print_normal(normal0, "NORMAL0") print_normal(normal1, "NORMAL1") diff --git a/library/core/src/unicode/printable.rs b/library/core/src/unicode/printable.rs index d8fb50e4ed296..878ecbc436384 100644 --- a/library/core/src/unicode/printable.rs +++ b/library/core/src/unicode/printable.rs @@ -1,92 +1,34 @@ // NOTE: The following code was generated by "library/core/src/unicode/printable.py", // do not edit directly! 
-fn check(x: u16, singletonuppers: &[(u8, u8)], singletonlowers: &[u8], normal: &[u8]) -> bool { - let xupper = (x >> 8) as u8; - let mut lowerstart = 0; - for &(upper, lowercount) in singletonuppers { - let lowerend = lowerstart + lowercount as usize; - if xupper == upper { - for &lower in &singletonlowers[lowerstart..lowerend] { - if lower == x as u8 { - return false; - } - } - } else if xupper < upper { - break; - } - lowerstart = lowerend; - } - - let mut x = x as i32; - let mut normal = normal.iter().cloned(); - let mut current = true; - while let Some(v) = normal.next() { - let len = if v & 0x80 != 0 { - ((v & 0x7f) as i32) << 8 | normal.next().unwrap() as i32 - } else { - v as i32 - }; - x -= len; - if x < 0 { - break; - } - current = !current; - } - current -} +mod check; +use check::{Normal, Singletons}; pub(crate) fn is_printable(x: char) -> bool { let x = x as u32; let lower = x as u16; - if x < 32 { - // ASCII fast path - false - } else if x < 127 { - // ASCII fast path - true - } else if x < 0x10000 { - check(lower, SINGLETONS0U, SINGLETONS0L, NORMAL0) - } else if x < 0x20000 { - check(lower, SINGLETONS1U, SINGLETONS1L, NORMAL1) - } else { - if 0x2a6e0 <= x && x < 0x2a700 { - return false; - } - if 0x2b73a <= x && x < 0x2b740 { - return false; - } - if 0x2b81e <= x && x < 0x2b820 { - return false; - } - if 0x2cea2 <= x && x < 0x2ceb0 { - return false; - } - if 0x2ebe1 <= x && x < 0x2ebf0 { - return false; - } - if 0x2ee5e <= x && x < 0x2f800 { - return false; - } - if 0x2fa1e <= x && x < 0x30000 { - return false; - } - if 0x3134b <= x && x < 0x31350 { - return false; - } - if 0x323b0 <= x && x < 0xe0100 { - return false; - } - if 0xe01f0 <= x && x < 0x110000 { - return false; - } - true + match x { + ..32 => false, // ASCII fast path. + ..127 => true, // ASCII fast path. 
+ ..0x10000 => SINGLETONS0.check(lower) && NORMAL0.check(lower), + ..0x20000 => SINGLETONS1.check(lower) && NORMAL1.check(lower), + 0x2a6e0..0x2a700 => false, + 0x2b73a..0x2b740 => false, + 0x2b81e..0x2b820 => false, + 0x2cea2..0x2ceb0 => false, + 0x2ebe1..0x2ebf0 => false, + 0x2ee5e..0x2f800 => false, + 0x2fa1e..0x30000 => false, + 0x3134b..0x31350 => false, + 0x323b0..0xe0100 => false, + 0xe01f0..0x110000 => false, + _ => true, } } #[rustfmt::skip] -const SINGLETONS0U: &[(u8, u8)] = &[ +const SINGLETONS0_UPPER: &[(u8, u8)] = &[ (0x00, 1), (0x03, 5), (0x05, 6), @@ -129,7 +71,7 @@ const SINGLETONS0U: &[(u8, u8)] = &[ (0xff, 9), ]; #[rustfmt::skip] -const SINGLETONS0L: &[u8] = &[ +const SINGLETONS0_LOWER: &[u8] = &[ 0xad, 0x78, 0x79, 0x8b, 0x8d, 0xa2, 0x30, 0x57, 0x58, 0x8b, 0x8c, 0x90, 0x1c, 0xdd, 0x0e, 0x0f, 0x4b, 0x4c, 0xfb, 0xfc, 0x2e, 0x2f, 0x3f, 0x5c, @@ -168,8 +110,9 @@ const SINGLETONS0L: &[u8] = &[ 0x75, 0xc8, 0xc9, 0xd0, 0xd1, 0xd8, 0xd9, 0xe7, 0xfe, 0xff, ]; +const SINGLETONS0: Singletons = Singletons::new(&SINGLETONS0_UPPER, &SINGLETONS0_LOWER); #[rustfmt::skip] -const SINGLETONS1U: &[(u8, u8)] = &[ +const SINGLETONS1_UPPER: &[(u8, u8)] = &[ (0x00, 6), (0x01, 1), (0x03, 1), @@ -216,7 +159,7 @@ const SINGLETONS1U: &[(u8, u8)] = &[ (0xfb, 1), ]; #[rustfmt::skip] -const SINGLETONS1L: &[u8] = &[ +const SINGLETONS1_LOWER: &[u8] = &[ 0x0c, 0x27, 0x3b, 0x3e, 0x4e, 0x4f, 0x8f, 0x9e, 0x9e, 0x9f, 0x7b, 0x8b, 0x93, 0x96, 0xa2, 0xb2, 0xba, 0x86, 0xb1, 0x06, 0x07, 0x09, 0x36, 0x3d, @@ -244,8 +187,9 @@ const SINGLETONS1L: &[u8] = &[ 0x7d, 0x7f, 0x8a, 0xa4, 0xaa, 0xaf, 0xb0, 0xc0, 0xd0, 0xae, 0xaf, 0x6e, 0x6f, 0xdd, 0xde, 0x93, ]; +const SINGLETONS1: Singletons = Singletons::new(&SINGLETONS1_UPPER, &SINGLETONS1_LOWER); #[rustfmt::skip] -const NORMAL0: &[u8] = &[ +const NORMAL0: Normal = Normal::new(&[ 0x00, 0x20, 0x5f, 0x22, 0x82, 0xdf, 0x04, @@ -379,9 +323,9 @@ const NORMAL0: &[u8] = &[ 0x80, 0xbe, 0x03, 0x1b, 0x03, 0x0f, 0x0d, -]; +]); #[rustfmt::skip] -const 
NORMAL1: &[u8] = &[ +const NORMAL1: Normal = Normal::new(&[ 0x5e, 0x22, 0x7b, 0x05, 0x03, 0x04, @@ -601,4 +545,4 @@ const NORMAL1: &[u8] = &[ 0x1c, 0x06, 0x09, 0x07, 0x80, 0xfa, 0x84, 0x06, -]; +]); diff --git a/library/core/src/unicode/printable/check.rs b/library/core/src/unicode/printable/check.rs new file mode 100644 index 0000000000000..d62a57950448e --- /dev/null +++ b/library/core/src/unicode/printable/check.rs @@ -0,0 +1,147 @@ +use crate::{iter, slice}; + +/// A compact representation of Unicode singletons. +/// +/// This is basically a `&[u16]`, but represented as `&[(u8, &[u8])]`, +/// i.e. pairs of upper bytes and multiple corresponding lower bytes. +/// +/// However, in order to reduce the pointer-sized overhead for each nested +/// slice, it is compacted again into `&[(u8, u8)]` with the length of the +/// lower bytes in the second byte, and a separate, contiguous `&[u8]` for +/// storing the lower bytes. +pub(super) struct Singletons { + upper: &'static [(u8, u8)], + lower: &'static [u8], +} + +impl Singletons { + /// Creates a new `Singletons` instance from compacted upper and lower bytes. + /// + /// # Panics + /// + /// Panics if the sum of all lengths (i.e. the second field of each pair) in `upper` + /// is not equal to the length of `lower`. + pub(super) const fn new(upper: &'static [(u8, u8)], lower: &'static [u8]) -> Self { + let mut lower_count_total = 0; + let mut i = 0; + while i < upper.len() { + lower_count_total += upper[i].1 as usize; + i += 1; + } + assert!( + lower_count_total == lower.len(), + "Sum of lengths in `upper` does not match `lower` length." 
+ ); + + Self { upper, lower } + } + + #[inline] + fn iter(&self) -> SingletonsIter { + SingletonsIter { iter: self.upper.iter().cloned(), lower: self.lower, lower_start: 0 } + } + + pub(super) fn check(&self, x: u16) -> bool { + let [x_upper, x_lower] = x.to_be_bytes(); + for (upper, lowers) in self.iter() { + if upper == x_upper { + for &lower in lowers { + if lower == x_lower { + return false; + } + } + } else if x_upper < upper { + break; + } + } + + true + } +} + +struct SingletonsIter { + iter: iter::Cloned<slice::Iter<'static, (u8, u8)>>, + lower: &'static [u8], + lower_start: usize, +} + +impl Iterator for SingletonsIter { + type Item = (u8, &'static [u8]); + + fn next(&mut self) -> Option<Self::Item> { + let (upper, lower_count) = self.iter.next()?; + + let lower_start = self.lower_start; + let lower_end = lower_start + lower_count as usize; + self.lower_start = lower_end; + + // SAFETY: The invariant for `Singletons` guarantees that the sum of all lengths + // in `upper` must be equal to the lengths of `lower`, so `lower_end` is guaranteed + // to be in range. + let lowers = unsafe { self.lower.get_unchecked(lower_start..lower_end) }; + + Some((upper, lowers)) + } +} + +/// A compact representation of lengths. +pub(super) struct Normal(&'static [u8]); + +impl Normal { + pub(super) const fn new(normal: &'static [u8]) -> Self { + // Invariant: Lengths greater than `0x7f` must be encoded as two bytes, + // with the length contained in the remaining 15 bits, i.e. `0x7fff`. + { + let mut i = 0; + while i < normal.len() { + if normal[i] & 0b1000_0000 != 0 { + assert!( + i + 1 < normal.len(), + "Length greater than `0x7f` is not encoded as two bytes."
+ ); + i += 2; + } else { + i += 1; + } + } + } + + Self(normal) + } + + #[inline] + fn iter(&self) -> NormalIter { + NormalIter { iter: self.0.iter().cloned() } + } + + pub(super) fn check(&self, mut x: u16) -> bool { + let mut current = true; + for len in self.iter() { + x = if let Some(x) = x.checked_sub(len) { x } else { break }; + current = !current; + } + current + } +} + +struct NormalIter { + iter: iter::Cloned<slice::Iter<'static, u8>>, +} + +impl Iterator for NormalIter { + type Item = u16; + + fn next(&mut self) -> Option<Self::Item> { + let len = self.iter.next()?; + + Some(if len & 0b1000_0000 != 0 { + let upper = len & 0b0111_1111; + // SAFETY: The invariant of `Normal` guarantees that lengths are encoded + // as two bytes if greater than `0x7f`, so there must be a next byte. + let lower = unsafe { self.iter.next().unwrap_unchecked() }; + u16::from_be_bytes([upper, lower]) + } else { + u16::from(len) + }) + } +}