Change uppercase of ß (U+00DF) to ẞ (U+1E9E), except when V6_COMPAT is set

ScottPJones · ScottPJones · commit 614db48e7aeb · 2018-10-02T11:51:25.000-04:00
diff --git a/Project.toml b/Project.toml
@@ -1,18 +1,20 @@
-name = "StrBase"
-uuid = "e79e7a6a-7bb1-5a4d-9d64-da657b06f53a"
+name     = "StrBase"
+desc     = "Basic functionality for Str types"
+authors  = ["ScottPJones <scottjones@alum.mit.edu>"]
 keywords = ["Strings"]
-license = "MIT"
-desc = "Basic functionality for Str types"
-authors = ["ScottPJones <scottjones@alum.mit.edu>"]
-version = "0.1.6"
+license  = "MIT"
+uuid     = "e79e7a6a-7bb1-5a4d-9d64-da657b06f53a"
+version  = "0.1.9"
 
 [deps]
-CharSetEncodings = "cb9422de-a9d8-5b68-86db-ff05833ab307"
-ChrBase = "c13fa7b1-fb91-5a40-8b3c-3aad7fd30002"
+Pkg     = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
+Test    = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
+Random  = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
+Unicode = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5"
+
 ModuleInterfaceTools = "5cb8414e-7aab-5a03-a681-351269c074bf"
+CharSetEncodings = "cb9422de-a9d8-5b68-86db-ff05833ab307"
 MurmurHash3 = "b10b62ed-fbae-5ea5-b934-abaf0477b71d"
-Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
-Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
+
 StrAPI = "69e7dfc3-c4d0-5e14-8d95-d6042a05b383"
-Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
-Unicode = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5"
+ChrBase = "c13fa7b1-fb91-5a40-8b3c-3aad7fd30002"
diff --git a/REQUIRE b/REQUIRE
@@ -3,4 +3,4 @@ MurmurHash3 0.1.5
 ModuleInterfaceTools 0.1.6
 StrAPI 0.1.7
 CharSetEncodings 0.1.8
-ChrBase 0.1.4
+ChrBase 0.1.5
diff --git a/src/casefold.jl b/src/casefold.jl
@@ -109,6 +109,12 @@ function uppercase_first(str::MaybeSub{S}) where {C<:LatinCSE,S<:Str{C}}
     end
 end
 
+@static if V6_COMPAT  # under V6_COMPAT, 0xdf is not treated as needing a wider code unit
+    _wide_lower_latin(ch) = (ch == 0xb5) | (ch == 0xff)  # uppercased to 0x39c / 0x178
+else
+    _wide_lower_latin(ch) = (ch == 0xb5) | (ch == 0xff) | (ch == 0xdf)  # 0xdf uppercases to 0x1e9e
+end
+
 # Special handling for characters that can't map into Latin1
 function uppercase_first(str::MaybeSub{S}) where {C<:_LatinCSE,S<:Str{C}}
     (len = ncodeunits(str)) == 0 && return str
@@ -120,9 +126,9 @@ function uppercase_first(str::MaybeSub{S}) where {C<:_LatinCSE,S<:Str{C}}
             set_codeunit!(out8, ch - 0x20)
             len > 1 && unsafe_copyto!(out8, pnt+1, len-1)
             Str(C, buf)
-        elseif (ch == 0xb5) | (ch == 0xff)
+        elseif _wide_lower_latin(ch)
             buf, out = _allocate(UInt16, len)
-            set_codeunit!(out, ifelse(ch == 0xb5, 0x39c, 0x178))
+            set_codeunit!(out, ifelse(ch == 0xb5, 0x39c, ifelse(ch == 0xff, 0x178, 0x1e9e)))
             # Perform the widen operation on the rest (should be done via SIMD)
             @inbounds for i = 2:len
                 set_codeunit!(out += 2, get_codeunit(pnt += 2)%UInt16)
@@ -152,7 +158,7 @@ function _upper(::Type{C}, beg::Ptr{UInt8}, off, len) where {C<:_LatinCSE}
     cur = beg + off
     # Need to scan the rest of the string to see if _widenupper needs to be called
     while cur < fin
-        ((ch = get_codeunit(cur)) == 0xb5) | (ch == 0xff) && return _widenupper(beg, off, len)
+        _wide_lower_latin(get_codeunit(cur)) && return _widenupper(beg, off, len)
         cur += 1
     end
     buf, out = _allocate(UInt8, len)
@@ -189,6 +195,8 @@ function _widenupper(beg::Ptr{UInt8}, off, len)
             set_codeunit!(out, 0x39c)
         elseif ch == 0xff
             set_codeunit!(out, 0x178)
+        elseif !V6_COMPAT && (ch == 0xdf)
+            set_codeunit!(out, 0x1e9e)
         else
             set_codeunit!(out, _can_upper(ch) ? ch - 0x20 : ch)
         end
@@ -218,7 +226,7 @@ function uppercase(str::MaybeSub{S}) where {C<:_LatinCSE,S<:Str{C}}
         fin = beg + len
         while pnt < fin
             ch = get_codeunit(pnt)
-            ((ch == 0xb5) | (ch == 0xff)) && return _widenupper(beg, pnt-beg, len)
+            _wide_lower_latin(ch) && return _widenupper(beg, pnt-beg, len)
             _can_upper(ch) && return _upper(C, beg, pnt-beg, len)
             pnt += 1
         end
@@ -336,6 +344,8 @@ function _upper(::Type{C}, beg, off, len) where {C<:Union{UCS2_CSEs,UTF32_CSEs}}
             set_codeunit!(out, 0x39c)
         elseif ch == 0xff
             set_codeunit!(out, 0x178)
+        elseif !V6_COMPAT && ch == 0xdf
+            set_codeunit!(out, 0x1e9e)
         end
         out += sizeof(CU)
     end
diff --git a/src/unicode.jl b/src/unicode.jl
@@ -8,17 +8,18 @@ Licensed under MIT License, see LICENSE.md
 # Recommended by deprecate
 @static if V6_COMPAT
     text_width(str::AbstractString) = strwidth(str)
+    text_width(str::Str) = mapreduce(text_width, +, 0, str)
 
     import Base: normalize_string
     Base.normalize_string(str::Str, opt::Symbol) = normalize(str, opt)
     Base.strwidth(str::Str) = text_width(str)
 else
+    text_width(str::Str) = mapreduce(text_width, +, str; init=0)
     Base.Unicode.normalize(str::Str, opt::Symbol) = normalize(str, opt)
 end
 
 ############################################################################
 
-text_width(str::Str) = mapreduce(text_width, +, 0, str)
 text_width(str::Str{Union{ASCIICSE,Latin_CSEs}}) = length(str)
 
 ############################################################################
diff --git a/src/utf16case.jl b/src/utf16case.jl
@@ -69,6 +69,8 @@ function _upper(::Type{<:Str{UTF16CSE}}, beg, off, len)
                 set_codeunit!(out, 0x39c)
             elseif ch == 0xff
                 set_codeunit!(out, 0x178)
+            elseif !V6_COMPAT && ch == 0xdf
+                set_codeunit!(out, 0x1e9e)
             end
         elseif is_surrogate_trail(ch)
             # pick up previous code unit (lead surrogate)
diff --git a/src/utf8case.jl b/src/utf8case.jl
@@ -91,7 +91,17 @@ function _upper_utf8(beg, off, len)
             out += 1
         elseif ch < 0xc4
             ch = (ch << 6) | (get_codeunit(pnt += 1) & 0x3f)
-            c16 = ch == 0xb5 ? 0x39c : (ch == 0xff ? 0x178 : (ch - _can_upper_l(ch)<<5)%UInt16)
+            if _can_upper_l(ch)
+                c16 = (ch - 0x20)%UInt16
+            elseif ch == 0xb5
+                c16 = 0x39c
+            elseif ch == 0xff
+                c16 = 0x178
+            elseif !V6_COMPAT && ch == 0xdf
+                c16 = 0x1e9e
+            else
+                c16 = ch%UInt16
+            end
             out = output_utf8_2byte!(out, c16)
         elseif ch < 0xe0
             # 2 byte
@@ -167,6 +177,13 @@ function lowercase(str::Str{UTF8CSE})
     end
 end
 
+# Check if the 2-byte UTF-8 sequence (lead byte `ch`, continuation byte at `pnt`) can be uppercased
+@inline function _check_uppercase(ch, pnt)
+    # ch < 0xc2 && return false (not needed, validated UTF-8 string)
+    cont = get_codeunit(pnt)
+    ch == 0xc3 ? ((cont > (V6_COMPAT ? 0x9f : 0x9e)) & (cont != 0xb7)) : (cont == 0xb5) # 0xc3 0x9f = U+00DF, 0xc3 0xb7 = U+00F7, 0xc2 0xb5 = U+00B5
+end
+
 function uppercase(str::Str{UTF8CSE})
     @preserve str begin
         pnt = beg = pointer(str)
@@ -176,14 +193,14 @@ function uppercase(str::Str{UTF8CSE})
             prv = pnt
             (ch < 0x80
              ? _islower_a(ch)
-             : (ch < 0xc4
-                ? _can_upper_only_latin((ch << 6) | (get_codeunit(pnt += 1) & 0x3f))
-                : _islower_u(ch >= 0xf0
+             : (ch > 0xc3
+                ? _islower_u(ch >= 0xf0
                              ? get_utf8_4byte(pnt += 3, ch)
                              : (ch < 0xe0
                                 ? get_utf8_2byte(pnt += 1, ch)
-                                : get_utf8_3byte(pnt += 2, ch))%UInt32))) &&
-                                    return _upper_utf8(beg, prv-beg, ncodeunits(str))
+                                : get_utf8_3byte(pnt += 2, ch))%UInt32)
+                : _check_uppercase(ch, pnt += 1))) &&
+                    return _upper_utf8(beg, prv-beg, ncodeunits(str))
             pnt += 1
         end
         str