feat(link-helper): improve ASCII normalization handling (#911)

mos379 · web-flow · commit 12590e5fbe17 · 2025-11-10T22:01:35.000+01:00
* feat(link-helper): improve ASCII normalization handling

Enhanced the `Urilize` method to better handle ASCII normalization and special characters. When `allowOnlyAscii` is true, the heading text is now decomposed with Unicode NFD and the resulting combining diacritical marks are skipped. Introduced handling for special German, Scandinavian, and Icelandic characters via new helper methods: `IsSpecialScandinavianOrGermanChar` and `NormalizeScandinavianOrGermanChar`.

Reorganized `using` directives for better clarity. Updated the processing loop in `Urilize` to handle normalized spans and ASCII equivalents more effectively. These changes improve link generation compatibility across various languages.

* Add tests for Scandinavian and German character normalization

Added tests that exercise the `NormalizeScandinavianOrGermanChar` mappings through `LinkHelper.Urilize`, validating character normalization for various special characters with `allowOnlyAscii` set to both true and false.

* test(link-helper): update ASCII transliteration tests

Updated test cases in `TestUrilizeOnlyAscii_Simple` to reflect
changes in `LinkHelper.Urilize` behavior. Non-ASCII characters
like `æ` and `ø` are now transliterated to their ASCII
equivalents (`ae` and `oe`) instead of being removed.
diff --git a/src/Markdig.Tests/TestLinkHelper.cs b/src/Markdig.Tests/TestLinkHelper.cs
@@ -327,8 +327,8 @@ public void TestUrilizeOnlyAscii_Simple(string input, string expectedResult)
         Assert.AreEqual(expectedResult, LinkHelper.Urilize(input, true));
     }
 
-    [TestCase("bær", "br")]
-    [TestCase("bør", "br")]
+    [TestCase("bær", "baer")]
+    [TestCase("bør", "boer")]
     [TestCase("bΘr", "br")]
     [TestCase("四五", "")]
     public void TestUrilizeOnlyAscii_NonAscii(string input, string expectedResult)
@@ -343,6 +343,75 @@ public void TestUrilizeOnlyAscii_Normalization(string input, string expectedResu
         Assert.AreEqual(expectedResult, LinkHelper.Urilize(input, true));
     }
 
+    // Tests for the NormalizeScandinavianOrGermanChar mappings, exercised through LinkHelper.Urilize.
+    // Every case in this group expects the same result for allowOnlyAscii=true and allowOnlyAscii=false.
+    //
+    // Note: When allowOnlyAscii=true, NFD (Canonical Decomposition) is applied first:
+    // - German umlauts ä,ö,ü decompose to base letter + combining mark (ü -> u + ¨)
+    //   Urilize then skips the combining mark, leaving just the base letter (ü -> u)
+    // - å decomposes similarly (å -> a + ˚ -> a)
+    // - But ø, æ, ß, þ, ð do NOT decompose, so they use NormalizeScandinavianOrGermanChar
+    //
+    // When allowOnlyAscii=false, no decomposition runs, so NormalizeScandinavianOrGermanChar handles ALL special chars
+    
+    // German ß (Eszett/sharp s) - does NOT decompose with NFD
+    [TestCase("Straße", "strasse")]    // ß -> ss (both allowOnlyAscii=true and false)
+    
+    // Scandinavian æ, ø - do NOT decompose with NFD
+    [TestCase("æble", "aeble")]        // æ -> ae (both modes)
+    [TestCase("Ærø", "aeroe")]         // Æ -> Ae, ø -> oe (both modes, then lowercase)
+    [TestCase("København", "koebenhavn")] // ø -> oe (both modes)
+    [TestCase("Øresund", "oeresund")]  // Ø -> Oe (both modes, then lowercase)
+    
+    // Icelandic þ, ð - do NOT decompose with NFD
+    [TestCase("þing", "thing")]        // þ (thorn) -> th (both modes)
+    [TestCase("bað", "bad")]           // ð (eth) -> d (both modes)
+    
+    // Mixed special characters (only chars that behave the same in both modes)
+    [TestCase("øst-æble", "oest-aeble")] // ø->oe, æ->ae (both modes)
+    public void TestUrilizeScandinavianGermanChars(string input, string expectedResult)
+    {
+        // The same expected value must come out with allowOnlyAscii set to true and to false
+        Assert.AreEqual(expectedResult, LinkHelper.Urilize(input, true));
+        Assert.AreEqual(expectedResult, LinkHelper.Urilize(input, false));
+    }
+    
+    // Tests specific to allowOnlyAscii=true behavior
+    // Precomposed accented letters (ä, ö, ü, å, and also ó, Í) decompose with NFD, so only the base letter remains
+    [TestCase("schön", "schon")]       // ö decomposes to o + combining mark; Urilize skips the mark
+    [TestCase("Mädchen", "madchen")]   // ä decomposes to a
+    [TestCase("Übung", "ubung")]       // Ü decomposes to U (then lowercase to u)
+    [TestCase("Düsseldorf", "dusseldorf")] // ü decomposes to u
+    [TestCase("Käse", "kase")]         // ä decomposes to a
+    [TestCase("gå", "ga")]             // å decomposes to a
+    [TestCase("Ålesund", "alesund")]   // Å decomposes to A (then lowercase)
+    [TestCase("grüßen", "grussen")]    // ü decomposes to u; ß has no decomposition and maps to ss
+    [TestCase("Þór", "thor")]          // Þ -> Th, ó decomposes to o (then lowercase)
+    [TestCase("Íslandsbanki", "islandsbanki")] // Í decomposes to I (then lowercase)
+    public void TestUrilizeOnlyAscii_GermanUmlautsDecompose(string input, string expectedResult)
+    {
+        // Only the allowOnlyAscii=true path is checked: NFD decomposition, then the combining marks are dropped
+        Assert.AreEqual(expectedResult, LinkHelper.Urilize(input, true));
+    }
+    
+    // Tests specific to allowOnlyAscii=false behavior
+    // No NFD step runs in this mode, so ä, ö, ü, å are also expanded by NormalizeScandinavianOrGermanChar
+    [TestCase("schön", "schoen")]      // ö -> oe (NormalizeScandinavianOrGermanChar)
+    [TestCase("Mädchen", "maedchen")]  // ä -> ae
+    [TestCase("Übung", "uebung")]      // Ü -> Ue (then lowercase)
+    [TestCase("Düsseldorf", "duesseldorf")] // ü -> ue
+    [TestCase("Käse", "kaese")]        // ä -> ae
+    [TestCase("gå", "gaa")]            // å -> aa
+    [TestCase("Ålesund", "aalesund")]  // Å -> Aa (then lowercase)
+    [TestCase("grüßen", "gruessen")]   // ü -> ue, ß -> ss
+    [TestCase("Þór", "thór")]          // Þ -> Th (then lowercase 'th'), ó has no mapping and is kept as-is
+    [TestCase("Íslandsbanki", "íslandsbanki")] // Í has no mapping; expected lowercased but still accented
+    public void TestUrilizeNonAscii_GermanUmlautsExpanded(string input, string expectedResult)
+    {
+        // Only the allowOnlyAscii=false path is checked: mapped letters expand, other non-ASCII letters stay
+        Assert.AreEqual(expectedResult, LinkHelper.Urilize(input, false));
+    }
+    
     [TestCase("123", "")]
     [TestCase("1,-b", "b")]
     [TestCase("b1,-", "b1")] // Not Pandoc equivalent: b1-
@@ -360,11 +429,11 @@ public void TestUrilizeNonAscii_NonAsciiNumeric(string input, string expectedRes
         Assert.AreEqual(expectedResult, LinkHelper.Urilize(input, false));
     }
 
-    [TestCase("bær", "bær")]
-    [TestCase("æ5el", "æ5el")]
-    [TestCase("-æ5el", "æ5el")]
-    [TestCase("-frø-", "frø")]
-    [TestCase("-fr-ø", "fr-ø")]
+    [TestCase("bær", "baer")]
+    [TestCase("æ5el", "ae5el")]
+    [TestCase("-æ5el", "ae5el")]
+    [TestCase("-frø-", "froe")]
+    [TestCase("-fr-ø", "fr-oe")]
     public void TestUrilizeNonAscii_Simple(string input, string expectedResult)
     {
         Assert.AreEqual(expectedResult, LinkHelper.Urilize(input, false));
@@ -393,4 +462,4 @@ public void TestUnicodeInDomainNameOfLinkReferenceDefinition()
     {
         TestParser.TestSpec("[Foo]\n\n[Foo]: http://ünicode.com", "<p><a href=\"http://xn--nicode-2ya.com\">Foo</a></p>");
     }
-}
+}
diff --git a/src/Markdig/Helpers/LinkHelper.cs b/src/Markdig/Helpers/LinkHelper.cs
@@ -2,11 +2,13 @@
 // This file is licensed under the BSD-Clause 2 license. 
 // See the license.txt file in the project root for more information.
 
+using Markdig.Syntax;
 using System.Buffers;
 using System.Diagnostics;
 using System.Diagnostics.CodeAnalysis;
+using System.Globalization;
 using System.Runtime.CompilerServices;
-using Markdig.Syntax;
+using System.Text;
 
 namespace Markdig.Helpers;
 
@@ -30,11 +32,38 @@ public static string Urilize(ReadOnlySpan<char> headingText, bool allowOnlyAscii
         var headingBuffer = new ValueStringBuilder(stackalloc char[ValueStringBuilder.StackallocThreshold]);
         bool hasLetter = keepOpeningDigits && headingText.Length > 0 && char.IsLetterOrDigit(headingText[0]);
         bool previousIsSpace = false;
-        for (int i = 0; i < headingText.Length; i++)
+
+        // First normalize the string to decompose characters if allowOnlyAscii is true
+        string normalizedString = string.Empty;
+        if (allowOnlyAscii)
         {
-            var c = headingText[i];
-            var normalized = allowOnlyAscii ? CharNormalizer.ConvertToAscii(c) : null;
-            for (int j = 0; j < (normalized?.Length ?? 1); j++)
+            normalizedString = headingText.ToString().Normalize(NormalizationForm.FormD);
+        }
+
+        var textToProcess = string.IsNullOrEmpty(normalizedString) ? headingText : normalizedString.AsSpan();
+
+        for (int i = 0; i < textToProcess.Length; i++)
+        {
+            var c = textToProcess[i];
+
+            // Skip combining diacritical marks when normalized
+            if (allowOnlyAscii && CharUnicodeInfo.GetUnicodeCategory(c) == UnicodeCategory.NonSpacingMark)
+            {
+                continue;
+            }
+
+            // Handle German umlauts and Norwegian/Danish characters explicitly (they don't decompose properly)
+            ReadOnlySpan<char> normalized;
+            if (IsSpecialScandinavianOrGermanChar(c))
+            {
+                normalized = NormalizeScandinavianOrGermanChar(c);
+            }
+            else
+            {
+                normalized = allowOnlyAscii ? CharNormalizer.ConvertToAscii(c) : null;
+            }
+
+            for (int j = 0; j < (normalized.Length < 1 ? 1 : normalized.Length); j++)
             {
                 if (normalized != null)
                 {
@@ -101,6 +130,50 @@ public static string Urilize(ReadOnlySpan<char> headingText, bool allowOnlyAscii
         return headingBuffer.ToString();
     }
 
+    [MethodImpl(MethodImplOptions.AggressiveInlining)]
+    private static bool IsSpecialScandinavianOrGermanChar(char c)
+    {
+        // True exactly for the letters NormalizeScandinavianOrGermanChar maps: German umlauts and ß,
+        // Danish/Norwegian æ, ø, å (å is also used in Swedish),
+        // Icelandic þ (thorn), ð (eth) - lower and upper case, except ß which is listed in lower case only
+        return c == 'ä' || c == 'ö' || c == 'ü' ||
+               c == 'Ä' || c == 'Ö' || c == 'Ü' ||
+               c == 'ß' ||
+               c == 'æ' || c == 'ø' || c == 'å' ||
+               c == 'Æ' || c == 'Ø' || c == 'Å' ||
+               c == 'þ' || c == 'ð' ||
+               c == 'Þ' || c == 'Ð';
+    }
+
+    [MethodImpl(MethodImplOptions.AggressiveInlining)]
+    private static ReadOnlySpan<char> NormalizeScandinavianOrGermanChar(char c)
+    {
+        return c switch
+        {
+            // German: each umlaut becomes its base vowel followed by "e"; ß becomes "ss"
+            'ä' => "ae",
+            'ö' => "oe",
+            'ü' => "ue",
+            'Ä' => "Ae",
+            'Ö' => "Oe",
+            'Ü' => "Ue",
+            'ß' => "ss",
+            // Danish/Norwegian (å is also used in Swedish); upper-case inputs capitalize only the first output letter
+            'æ' => "ae",
+            'ø' => "oe",
+            'å' => "aa",
+            'Æ' => "Ae",
+            'Ø' => "Oe",
+            'Å' => "Aa",
+            // Icelandic: thorn becomes "th", eth becomes "d"
+            'þ' => "th",
+            'Þ' => "Th",
+            'ð' => "d",
+            'Ð' => "D",
+            _ => ReadOnlySpan<char>.Empty // any other character has no mapping here
+        };
+    }
+
     public static string UrilizeAsGfm(string headingText)
     {
         return UrilizeAsGfm(headingText.AsSpan());
@@ -218,7 +291,8 @@ public static bool TryParseAutolink(ref StringSlice text, [NotNullWhen(true)] ou
                 }
                 state = 1;
                 break;
-            } else if (c == '@')
+            }
+            else if (c == '@')
             {
                 if (state > 0)
                 {
@@ -234,7 +308,7 @@ public static bool TryParseAutolink(ref StringSlice text, [NotNullWhen(true)] ou
         }
 
         // append ':' or '@' 
-        builder.Append(c); 
+        builder.Append(c);
 
         if (state < 0)
         {