Skip to content

Commit 37557e4

Browse files
author
bors-servo
authored
Auto merge of #365 - froydnj:uts-data-slimming, r=SimonSapin
More idna data slimming.

We can do a better job of packing the uts46 data:

* We can merge identically-mapped entries that don't have an associated string slice. This saves ~10% of the space.
* We can make slices smaller and pack them into `Mapping` better, which saves 25% of the space on 64-bit platforms. I think it might save half that on 32-bit platforms, but I didn't check.

Together these are good for ~42KB of space savings on a 64-bit platform.

This change is reviewable at https://reviewable.io/reviews/servo/rust-url/365
2 parents d19d5d0 + c018150 commit 37557e4

File tree

3 files changed

+6015
-6700
lines changed

3 files changed

+6015
-6700
lines changed

idna/src/make_uts46_mapping_table.py

Lines changed: 57 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,15 @@ def strtab_slice(s):
5151
return c
5252

5353
def rust_slice(s):
    """Render a string-table slice as Rust `StringTableSlice { ... }` source text.

    `s` is a `(start, length)` pair. The start offset is emitted as two
    separate bytes (`byte_start_lo`, `byte_start_hi`) and the length as one
    byte, matching the three `u8` fields of the Rust struct.

    Returns the parenthesised struct literal, ready to be appended to a
    mapping variant name.

    Raises AssertionError if `start` does not fit in 16 bits or `length`
    does not fit in 8 bits (including negative values), since such a slice
    cannot be represented in the generated table.
    """
    start = s[0]
    length = s[1]
    # Check both ends of each range up front: a negative value would slip
    # past an upper-bound-only check and produce an invalid `u8` literal.
    assert 0 <= length <= 0xff, "slice length %r does not fit in a u8" % (length,)
    assert 0 <= start <= 0xffff, "slice start %r does not fit in 16 bits" % (start,)
    start_lo = start & 0xff
    start_hi = start >> 8
    return "(StringTableSlice { byte_start_lo: %d, byte_start_hi: %d, byte_len: %d })" % (start_lo, start_hi, length)
61+
62+
ranges = []
5563

5664
for line in txt:
5765
# remove comments
@@ -66,12 +74,58 @@ def rust_slice(s):
6674
if not last:
6775
last = first
6876
mapping = fields[1].strip().replace('_', ' ').title().replace(' ', '')
77+
unicode_str = None
6978
if len(fields) > 2:
7079
if fields[2].strip():
7180
unicode_str = u''.join(char(c) for c in fields[2].strip().split(' '))
72-
mapping += rust_slice(strtab_slice(unicode_str))
7381
elif mapping == "Deviation":
74-
mapping += rust_slice(strtab_slice(''))
82+
unicode_str = u''
83+
ranges.append((first, last, mapping, unicode_str))
84+
85+
def mergeable_key(r):
    """Return the grouping key for one range tuple `r`.

    `r[2]` is the mapping name. Mappings that carry associated data
    ('Mapped', 'Deviation', 'DisallowedStd3Mapped') return the whole tuple
    as their key, so they are not merged with their neighbours. Every other
    mapping returns just its name, so adjacent ranges with the same
    data-free mapping share a key.

    Raises AssertionError for a mapping name outside the known set.
    """
    kind = r[2]
    if kind not in ('Mapped', 'Deviation', 'DisallowedStd3Mapped'):
        # Only the data-free mapping kinds may be merged; anything else is
        # an unexpected table entry.
        assert kind in ('Valid', 'Ignored', 'Disallowed', 'DisallowedStd3Valid')
        return kind
    # Data-carrying kinds keep their full tuple as the key.
    return r
92+
93+
grouped_ranges = itertools.groupby(ranges, key=mergeable_key)
94+
95+
optimized_ranges = []
96+
97+
# Collapse each run of adjacent ranges that share a grouping key into a single
# (first, last, mapping, unicode_str) tuple appended to `optimized_ranges`.
for (k, g) in grouped_ranges:
    group = list(g)
    if len(group) == 1:
        # Nothing to merge; keep the range exactly as parsed.
        optimized_ranges.append(group[0])
        continue
    # Assert that nothing in the group has an associated unicode string.
    # Entries without a string carry None in slot 3, so check for that before
    # taking len(); len(None) would raise TypeError.
    for member in group:
        if member[3] is not None and len(member[3]) > 2:
            assert not member[3][2].strip()
    # Assert that consecutive members of the group don't leave gaps in
    # the codepoint space. Pairing the list with itself shifted by one works
    # on both Python 2 and Python 3 (itertools.izip exists only on Python 2).
    for (g1, g2) in zip(group, group[1:]):
        last_char = int(g1[1], 16)
        next_char = int(g2[0], 16)
        if last_char + 1 == next_char:
            continue
        # There's a gap where surrogates would appear, but we don't have to
        # worry about that gap, as surrogates never appear in Rust strings.
        # Assert we're seeing the surrogate case here.
        assert last_char == 0xd7ff
        assert next_char == 0xe000
    # The merged range spans from the first member's start to the last
    # member's end and takes its mapping and string slot from the first member.
    first = group[0][0]
    last = group[-1][1]
    mapping = group[0][2]
    unicode_str = group[0][3]
    optimized_ranges.append((first, last, mapping, unicode_str))
125+
126+
for (first, last, mapping, unicode_str) in optimized_ranges:
127+
if unicode_str is not None:
128+
mapping += rust_slice(strtab_slice(unicode_str))
75129
print(" Range { from: '%s', to: '%s', mapping: %s }," % (escape_char(char(first)),
76130
escape_char(char(last)),
77131
mapping))

idna/src/uts46.rs

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -21,17 +21,22 @@ include!("uts46_mapping_table.rs");
2121

2222
/// Position and length, in bytes, of one string inside `STRING_TABLE`.
#[derive(Debug)]
struct StringTableSlice {
    // The start offset is kept as two single-byte halves rather than one
    // wider integer: that gives the structure an alignment of 1, so it packs
    // better into the `Mapping` enum, below.
    /// Low byte of the start offset.
    byte_start_lo: u8,
    /// High byte of the start offset.
    byte_start_hi: u8,
    /// Length of the slice in bytes.
    byte_len: u8,
}
2730

2831
fn decode_slice(slice: &StringTableSlice) -> &'static str {
29-
let start = slice.byte_start as usize;
32+
let lo = slice.byte_start_lo as usize;
33+
let hi = slice.byte_start_hi as usize;
34+
let start = (hi << 8) | lo;
3035
let len = slice.byte_len as usize;
3136
&STRING_TABLE[start..(start + len)]
3237
}
3338

34-
#[repr(u16)]
39+
#[repr(u8)]
3540
#[derive(Debug)]
3641
enum Mapping {
3742
Valid,

0 commit comments

Comments
 (0)