Skip to content

Commit 614db48

Browse files
committed
Change in uppercase of ß
1 parent d981934 commit 614db48

File tree

6 files changed

+56
-24
lines changed

6 files changed

+56
-24
lines changed

Project.toml

Lines changed: 14 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,20 @@
1-
name = "StrBase"
2-
uuid = "e79e7a6a-7bb1-5a4d-9d64-da657b06f53a"
1+
name = "StrBase"
2+
desc = "Basic functionality for Str types"
3+
authors = ["ScottPJones <[email protected]>"]
34
keywords = ["Strings"]
4-
license = "MIT"
5-
desc = "Basic functionality for Str types"
6-
authors = ["ScottPJones <[email protected]>"]
7-
version = "0.1.6"
5+
license = "MIT"
6+
uuid = "e79e7a6a-7bb1-5a4d-9d64-da657b06f53a"
7+
version = "0.1.9"
88

99
[deps]
10-
CharSetEncodings = "cb9422de-a9d8-5b68-86db-ff05833ab307"
11-
ChrBase = "c13fa7b1-fb91-5a40-8b3c-3aad7fd30002"
10+
Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
11+
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
12+
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
13+
Unicode = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5"
14+
1215
ModuleInterfaceTools = "5cb8414e-7aab-5a03-a681-351269c074bf"
16+
CharSetEncodings = "cb9422de-a9d8-5b68-86db-ff05833ab307"
1317
MurmurHash3 = "b10b62ed-fbae-5ea5-b934-abaf0477b71d"
14-
Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
15-
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
18+
1619
StrAPI = "69e7dfc3-c4d0-5e14-8d95-d6042a05b383"
17-
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
18-
Unicode = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5"
20+
ChrBase = "c13fa7b1-fb91-5a40-8b3c-3aad7fd30002"

REQUIRE

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,4 +3,4 @@ MurmurHash3 0.1.5
33
ModuleInterfaceTools 0.1.6
44
StrAPI 0.1.7
55
CharSetEncodings 0.1.8
6-
ChrBase 0.1.4
6+
ChrBase 0.1.5

src/casefold.jl

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -109,6 +109,12 @@ function uppercase_first(str::MaybeSub{S}) where {C<:LatinCSE,S<:Str{C}}
109109
end
110110
end
111111

112+
# Predicate on a single code unit `ch`: true for the Latin-1 values that the
# uppercase routines in this file replace with a code unit above 0xff, namely
# 0xb5 (written out as 0x39c) and 0xff (written out as 0x178).
# When V6_COMPAT is false, 0xdf (written out as 0x1e9e) is included as well.
# Uses non-short-circuit `|`, so every comparison is evaluated.
@static if V6_COMPAT
113+
_wide_lower_latin(ch) = (ch == 0xb5) | (ch == 0xff)
114+
else
115+
_wide_lower_latin(ch) = (ch == 0xb5) | (ch == 0xff) | (ch == 0xdf)
116+
end
117+
112118
# Special handling for characters that can't map into Latin1
113119
function uppercase_first(str::MaybeSub{S}) where {C<:_LatinCSE,S<:Str{C}}
114120
(len = ncodeunits(str)) == 0 && return str
@@ -120,9 +126,9 @@ function uppercase_first(str::MaybeSub{S}) where {C<:_LatinCSE,S<:Str{C}}
120126
set_codeunit!(out8, ch - 0x20)
121127
len > 1 && unsafe_copyto!(out8, pnt+1, len-1)
122128
Str(C, buf)
123-
elseif (ch == 0xb5) | (ch == 0xff)
129+
elseif _wide_lower_latin(ch)
124130
buf, out = _allocate(UInt16, len)
125-
set_codeunit!(out, ifelse(ch == 0xb5, 0x39c, 0x178))
131+
set_codeunit!(out, ifelse(ch == 0xb5, 0x39c, ifelse(ch == 0xff, 0x178, 0x1e9e)))
126132
# Perform the widen operation on the rest (should be done via SIMD)
127133
@inbounds for i = 2:len
128134
set_codeunit!(out += 2, get_codeunit(pnt += 2)%UInt16)
@@ -152,7 +158,7 @@ function _upper(::Type{C}, beg::Ptr{UInt8}, off, len) where {C<:_LatinCSE}
152158
cur = beg + off
153159
# Need to scan the rest of the string to see if _widenupper needs to be called
154160
while cur < fin
155-
((ch = get_codeunit(cur)) == 0xb5) | (ch == 0xff) && return _widenupper(beg, off, len)
161+
_wide_lower_latin(get_codeunit(cur)) && return _widenupper(beg, off, len)
156162
cur += 1
157163
end
158164
buf, out = _allocate(UInt8, len)
@@ -189,6 +195,8 @@ function _widenupper(beg::Ptr{UInt8}, off, len)
189195
set_codeunit!(out, 0x39c)
190196
elseif ch == 0xff
191197
set_codeunit!(out, 0x178)
198+
elseif !V6_COMPAT && (ch == 0xdf)
199+
set_codeunit!(out, 0x1e9e)
192200
else
193201
set_codeunit!(out, _can_upper(ch) ? ch - 0x20 : ch)
194202
end
@@ -218,7 +226,7 @@ function uppercase(str::MaybeSub{S}) where {C<:_LatinCSE,S<:Str{C}}
218226
fin = beg + len
219227
while pnt < fin
220228
ch = get_codeunit(pnt)
221-
((ch == 0xb5) | (ch == 0xff)) && return _widenupper(beg, pnt-beg, len)
229+
_wide_lower_latin(ch) && return _widenupper(beg, pnt-beg, len)
222230
_can_upper(ch) && return _upper(C, beg, pnt-beg, len)
223231
pnt += 1
224232
end
@@ -336,6 +344,8 @@ function _upper(::Type{C}, beg, off, len) where {C<:Union{UCS2_CSEs,UTF32_CSEs}}
336344
set_codeunit!(out, 0x39c)
337345
elseif ch == 0xff
338346
set_codeunit!(out, 0x178)
347+
elseif !V6_COMPAT && ch == 0xdf
348+
set_codeunit!(out, 0x1e9e)
339349
end
340350
out += sizeof(CU)
341351
end

src/unicode.jl

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,17 +8,18 @@ Licensed under MIT License, see LICENSE.md
88
# Recommended by deprecate
99
# Version-dependent definitions, selected at compile time by V6_COMPAT.
# Both branches define text_width(::Str) as the sum of text_width over the
# elements of the string, starting from 0; they differ only in how the initial
# value is handed to mapreduce (positional argument vs. `init` keyword).
@static if V6_COMPAT
1010
text_width(str::AbstractString) = strwidth(str)
11+
text_width(str::Str) = mapreduce(text_width, +, 0, str)
1112

1213
import Base: normalize_string
1314
Base.normalize_string(str::Str, opt::Symbol) = normalize(str, opt)
1415
Base.strwidth(str::Str) = text_width(str)
1516
else
17+
text_width(str::Str) = mapreduce(text_width, +, str; init=0)
1618
Base.Unicode.normalize(str::Str, opt::Symbol) = normalize(str, opt)
1719
end
1820

1921
############################################################################
2022

21-
text_width(str::Str) = mapreduce(text_width, +, 0, str)
2223
# Shortcut: reports the width of the string as its length.
# NOTE(review): Julia type parameters are invariant, so `Str{Union{...}}` only
# matches a Str whose first parameter is exactly this Union, not a Str
# parameterized by one of its members. `Str{<:Union{ASCIICSE,Latin_CSEs}}` may
# have been intended -- confirm against the definition of Str before changing.
text_width(str::Str{Union{ASCIICSE,Latin_CSEs}}) = length(str)
2324

2425
############################################################################

src/utf16case.jl

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,8 @@ function _upper(::Type{<:Str{UTF16CSE}}, beg, off, len)
6969
set_codeunit!(out, 0x39c)
7070
elseif ch == 0xff
7171
set_codeunit!(out, 0x178)
72+
elseif !V6_COMPAT && ch == 0xdf
73+
set_codeunit!(out, 0x1e9e)
7274
end
7375
elseif is_surrogate_trail(ch)
7476
# pick up previous code unit (lead surrogate)

src/utf8case.jl

Lines changed: 23 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -91,7 +91,17 @@ function _upper_utf8(beg, off, len)
9191
out += 1
9292
elseif ch < 0xc4
9393
ch = (ch << 6) | (get_codeunit(pnt += 1) & 0x3f)
94-
c16 = ch == 0xb5 ? 0x39c : (ch == 0xff ? 0x178 : (ch - _can_upper_l(ch)<<5)%UInt16)
94+
if _can_upper_l(ch)
95+
c16 = (ch - 0x20)%UInt16
96+
elseif ch == 0xb5
97+
c16 = 0x39c
98+
elseif ch == 0xff
99+
c16 = 0x178
100+
elseif !V6_COMPAT && ch == 0xdf
101+
c16 = 0x1e9e
102+
else
103+
c16 = ch%UInt16
104+
end
95105
out = output_utf8_2byte!(out, c16)
96106
elseif ch < 0xe0
97107
# 2 byte
@@ -167,6 +177,13 @@ function lowercase(str::Str{UTF8CSE})
167177
end
168178
end
169179

180+
# Check whether the character can be uppercased
181+
# `ch` is a lead byte and `pnt` points at the code unit that is read as its
# continuation byte `cont`.
# - ch == 0xc3: true when `cont` is above 0x9f (above 0x9e when V6_COMPAT is
#   false, which additionally admits 0x9f) and is not 0xb7.
#   In UTF-8, 0xc3 0x9f encodes U+00DF and 0xc3 0xb7 encodes U+00F7.
# - any other `ch`: true only when `cont` is 0xb5 (0xc2 0xb5 encodes U+00B5).
#   NOTE(review): presumably the only other lead byte passed here is 0xc2,
#   since the visible caller uses this for lead bytes not above 0xc3 -- confirm.
# Uses non-short-circuit `&`, so both comparisons are evaluated.
@inline function _check_uppercase(ch, pnt)
182+
# ch < 0xc2 && return false (not needed, validated UTF-8 string)
183+
cont = get_codeunit(pnt)
184+
ch == 0xc3 ? ((cont > (V6_COMPAT ? 0x9f : 0x9e)) & (cont != 0xb7)) : (cont == 0xb5)
185+
end
186+
170187
function uppercase(str::Str{UTF8CSE})
171188
@preserve str begin
172189
pnt = beg = pointer(str)
@@ -176,14 +193,14 @@ function uppercase(str::Str{UTF8CSE})
176193
prv = pnt
177194
(ch < 0x80
178195
? _islower_a(ch)
179-
: (ch < 0xc4
180-
? _can_upper_only_latin((ch << 6) | (get_codeunit(pnt += 1) & 0x3f))
181-
: _islower_u(ch >= 0xf0
196+
: (ch > 0xc3
197+
? _islower_u(ch >= 0xf0
182198
? get_utf8_4byte(pnt += 3, ch)
183199
: (ch < 0xe0
184200
? get_utf8_2byte(pnt += 1, ch)
185-
: get_utf8_3byte(pnt += 2, ch))%UInt32))) &&
186-
return _upper_utf8(beg, prv-beg, ncodeunits(str))
201+
: get_utf8_3byte(pnt += 2, ch))%UInt32)
202+
: _check_uppercase(ch, pnt += 1))) &&
203+
return _upper_utf8(beg, prv-beg, ncodeunits(str))
187204
pnt += 1
188205
end
189206
str

0 commit comments

Comments
 (0)