Skip to content

Commit 12590e5

Browse files
authored
feat(link-helper): improve ASCII normalization handling (#911)
* feat(link-helper): improve ASCII normalization handling. Enhanced the `Urilize` method to better handle ASCII normalization and special characters. Added support for decomposing characters when `allowOnlyAscii` is true and skipping diacritical marks. Introduced handling for special German, Scandinavian, and Icelandic characters via new helper methods: `IsSpecialScandinavianOrGermanChar` and `NormalizeScandinavianOrGermanChar`. Reorganized `using` directives for better clarity. Updated the processing loop in `Urilize` to handle normalized spans and ASCII equivalents more effectively. These changes improve link generation compatibility across various languages.
* Add tests for Scandinavian and German character normalization. Added tests for the `NormalizeScandinavianOrGermanChar` method to validate character normalization for various special characters in both ASCII and non-ASCII contexts.
* test(link-helper): update ASCII transliteration tests. Updated test cases in `TestUrilizeOnlyAscii_Simple` to reflect changes in `LinkHelper.Urilize` behavior. Non-ASCII characters like `æ` and `ø` are now transliterated to their ASCII equivalents (`ae` and `oe`) instead of being removed.
1 parent 8c01cf0 commit 12590e5

File tree

2 files changed

+158
-15
lines changed

2 files changed

+158
-15
lines changed

src/Markdig.Tests/TestLinkHelper.cs

Lines changed: 77 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -327,8 +327,8 @@ public void TestUrilizeOnlyAscii_Simple(string input, string expectedResult)
327327
Assert.AreEqual(expectedResult, LinkHelper.Urilize(input, true));
328328
}
329329

330-
[TestCase("bær", "br")]
331-
[TestCase("bør", "br")]
330+
[TestCase("bær", "baer")]
331+
[TestCase("bør", "boer")]
332332
[TestCase("bΘr", "br")]
333333
[TestCase("四五", "")]
334334
public void TestUrilizeOnlyAscii_NonAscii(string input, string expectedResult)
@@ -343,6 +343,75 @@ public void TestUrilizeOnlyAscii_Normalization(string input, string expectedResu
343343
Assert.AreEqual(expectedResult, LinkHelper.Urilize(input, true));
344344
}
345345

346+
// Tests for NormalizeScandinavianOrGermanChar method mappings
347+
// These special characters are always normalized (both allowOnlyAscii=true and false)
348+
//
349+
// Note: When allowOnlyAscii=true, NFD (Canonical Decomposition) is applied first:
350+
// - German umlauts ä,ö,ü decompose to base letter + combining mark (ü -> u + ¨)
351+
// The combining mark is then stripped, leaving just the base letter (ü -> u)
352+
// - å decomposes similarly (å -> a + ˚ -> a)
353+
// - But ø, æ, ß, þ, ð do NOT decompose, so they use NormalizeScandinavianOrGermanChar
354+
//
355+
// When allowOnlyAscii=false, NormalizeScandinavianOrGermanChar is used for ALL special chars
356+
357+
// German ß (Eszett/sharp s) - does NOT decompose with NFD
358+
[TestCase("Straße", "strasse")] // ß -> ss (both allowOnlyAscii=true and false)
359+
360+
// Scandinavian æ, ø - do NOT decompose with NFD
361+
[TestCase("æble", "aeble")] // æ -> ae (both modes)
362+
[TestCase("Ærø", "aeroe")] // Æ -> Ae, ø -> oe (both modes, then lowercase)
363+
[TestCase("København", "koebenhavn")] // ø -> oe (both modes)
364+
[TestCase("Øresund", "oeresund")] // Ø -> Oe (both modes, then lowercase)
365+
366+
// Icelandic þ, ð - do NOT decompose with NFD
367+
[TestCase("þing", "thing")] // þ (thorn) -> th (both modes)
368+
[TestCase("bað", "bad")] // ð (eth) -> d (both modes)
369+
370+
// Mixed special characters (only characters that behave the same in both modes)
371+
[TestCase("øst-æble", "oest-aeble")] // ø->oe, æ->ae (both modes)
372+
public void TestUrilizeScandinavianGermanChars(string input, string expectedResult)
373+
{
374+
// These transformations apply regardless of allowOnlyAscii flag
375+
Assert.AreEqual(expectedResult, LinkHelper.Urilize(input, true));
376+
Assert.AreEqual(expectedResult, LinkHelper.Urilize(input, false));
377+
}
378+
379+
// Tests specific to allowOnlyAscii=true behavior
380+
// German umlauts (ä, ö, ü) and å decompose with NFD, so they become base letter only
381+
[TestCase("schön", "schon")] // ö decomposes to o (NFD strips combining mark)
382+
[TestCase("Mädchen", "madchen")] // ä decomposes to a
383+
[TestCase("Übung", "ubung")] // Ü decomposes to U (then lowercase to u)
384+
[TestCase("Düsseldorf", "dusseldorf")] // ü decomposes to u
385+
[TestCase("Käse", "kase")] // ä decomposes to a
386+
[TestCase("gå", "ga")] // å decomposes to a
387+
[TestCase("Ålesund", "alesund")] // Å decomposes to A (then lowercase)
388+
[TestCase("grüßen", "grussen")] // ü decomposes to u, ß -> ss
389+
[TestCase("Þór", "thor")] // Þ -> Th, ó decomposes to o (then lowercase)
390+
[TestCase("Íslandsbanki", "islandsbanki")] // Í decomposes to I (then lowercase)
391+
public void TestUrilizeOnlyAscii_GermanUmlautsDecompose(string input, string expectedResult)
392+
{
393+
// With allowOnlyAscii=true, these characters decompose via NFD and lose their diacritics
394+
Assert.AreEqual(expectedResult, LinkHelper.Urilize(input, true));
395+
}
396+
397+
// Tests specific to allowOnlyAscii=false behavior
398+
// All special chars use NormalizeScandinavianOrGermanChar (including ä, ö, ü, å)
399+
[TestCase("schön", "schoen")] // ö -> oe (NormalizeScandinavianOrGermanChar)
400+
[TestCase("Mädchen", "maedchen")] // ä -> ae
401+
[TestCase("Übung", "uebung")] // Ü -> Ue (then lowercase)
402+
[TestCase("Düsseldorf", "duesseldorf")] // ü -> ue
403+
[TestCase("Käse", "kaese")] // ä -> ae
404+
[TestCase("gå", "gaa")] // å -> aa
405+
[TestCase("Ålesund", "aalesund")] // Å -> Aa (then lowercase)
406+
[TestCase("grüßen", "gruessen")] // ü -> ue, ß -> ss
407+
[TestCase("Þór", "thór")] // Þ -> Th (then lowercase 'th'), ó is kept as-is
408+
[TestCase("Íslandsbanki", "íslandsbanki")] // í is kept as-is when allowOnlyAscii=false
409+
public void TestUrilizeNonAscii_GermanUmlautsExpanded(string input, string expectedResult)
410+
{
411+
// With allowOnlyAscii=false, these characters use NormalizeScandinavianOrGermanChar
412+
Assert.AreEqual(expectedResult, LinkHelper.Urilize(input, false));
413+
}
414+
346415
[TestCase("123", "")]
347416
[TestCase("1,-b", "b")]
348417
[TestCase("b1,-", "b1")] // Not Pandoc equivalent: b1-
@@ -360,11 +429,11 @@ public void TestUrilizeNonAscii_NonAsciiNumeric(string input, string expectedRes
360429
Assert.AreEqual(expectedResult, LinkHelper.Urilize(input, false));
361430
}
362431

363-
[TestCase("bær", "bær")]
364-
[TestCase("æ5el", "æ5el")]
365-
[TestCase("-æ5el", "æ5el")]
366-
[TestCase("-frø-", "frø")]
367-
[TestCase("-fr-ø", "fr-ø")]
432+
[TestCase("bær", "baer")]
433+
[TestCase("æ5el", "ae5el")]
434+
[TestCase("-æ5el", "ae5el")]
435+
[TestCase("-frø-", "froe")]
436+
[TestCase("-fr-ø", "fr-oe")]
368437
public void TestUrilizeNonAscii_Simple(string input, string expectedResult)
369438
{
370439
Assert.AreEqual(expectedResult, LinkHelper.Urilize(input, false));
@@ -393,4 +462,4 @@ public void TestUnicodeInDomainNameOfLinkReferenceDefinition()
393462
{
394463
TestParser.TestSpec("[Foo]\n\n[Foo]: http://ünicode.com", "<p><a href=\"http://xn--nicode-2ya.com\">Foo</a></p>");
395464
}
396-
}
465+
}

src/Markdig/Helpers/LinkHelper.cs

Lines changed: 81 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -2,11 +2,13 @@
22
// This file is licensed under the BSD-Clause 2 license.
33
// See the license.txt file in the project root for more information.
44

5+
using Markdig.Syntax;
56
using System.Buffers;
67
using System.Diagnostics;
78
using System.Diagnostics.CodeAnalysis;
9+
using System.Globalization;
810
using System.Runtime.CompilerServices;
9-
using Markdig.Syntax;
11+
using System.Text;
1012

1113
namespace Markdig.Helpers;
1214

@@ -30,11 +32,38 @@ public static string Urilize(ReadOnlySpan<char> headingText, bool allowOnlyAscii
3032
var headingBuffer = new ValueStringBuilder(stackalloc char[ValueStringBuilder.StackallocThreshold]);
3133
bool hasLetter = keepOpeningDigits && headingText.Length > 0 && char.IsLetterOrDigit(headingText[0]);
3234
bool previousIsSpace = false;
33-
for (int i = 0; i < headingText.Length; i++)
35+
36+
// First normalize the string to decompose characters if allowOnlyAscii is true
37+
string normalizedString = string.Empty;
38+
if (allowOnlyAscii)
3439
{
35-
var c = headingText[i];
36-
var normalized = allowOnlyAscii ? CharNormalizer.ConvertToAscii(c) : null;
37-
for (int j = 0; j < (normalized?.Length ?? 1); j++)
40+
normalizedString = headingText.ToString().Normalize(NormalizationForm.FormD);
41+
}
42+
43+
var textToProcess = string.IsNullOrEmpty(normalizedString) ? headingText : normalizedString.AsSpan();
44+
45+
for (int i = 0; i < textToProcess.Length; i++)
46+
{
47+
var c = textToProcess[i];
48+
49+
// Skip combining diacritical marks when normalized
50+
if (allowOnlyAscii && CharUnicodeInfo.GetUnicodeCategory(c) == UnicodeCategory.NonSpacingMark)
51+
{
52+
continue;
53+
}
54+
55+
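// Transliterate special German, Scandinavian, and Icelandic characters explicitly. When allowOnlyAscii is true, ä/ö/ü/å were already decomposed by NFD above, so only characters without a canonical decomposition (ß, æ, ø, þ, ð and their uppercase forms) reach this branch.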
56+
ReadOnlySpan<char> normalized;
57+
if (IsSpecialScandinavianOrGermanChar(c))
58+
{
59+
normalized = NormalizeScandinavianOrGermanChar(c);
60+
}
61+
else
62+
{
63+
normalized = allowOnlyAscii ? CharNormalizer.ConvertToAscii(c) : null;
64+
}
65+
66+
for (int j = 0; j < (normalized.Length < 1 ? 1 : normalized.Length); j++)
3867
{
3968
if (normalized != null)
4069
{
@@ -101,6 +130,50 @@ public static string Urilize(ReadOnlySpan<char> headingText, bool allowOnlyAscii
101130
return headingBuffer.ToString();
102131
}
103132

133+
[MethodImpl(MethodImplOptions.AggressiveInlining)]
134+
private static bool IsSpecialScandinavianOrGermanChar(char c)
135+
{
136+
// German umlauts and ß
137+
// Norwegian/Danish/Swedish æ, ø, å
138+
// Icelandic þ (thorn), ð (eth)
139+
return c == 'ä' || c == 'ö' || c == 'ü' ||
140+
c == 'Ä' || c == 'Ö' || c == 'Ü' ||
141+
c == 'ß' ||
142+
c == 'æ' || c == 'ø' || c == 'å' ||
143+
c == 'Æ' || c == 'Ø' || c == 'Å' ||
144+
c == 'þ' || c == 'ð' ||
145+
c == 'Þ' || c == 'Ð';
146+
}
147+
148+
[MethodImpl(MethodImplOptions.AggressiveInlining)]
149+
private static ReadOnlySpan<char> NormalizeScandinavianOrGermanChar(char c)
150+
{
151+
return c switch
152+
{
153+
// German
154+
'ä' => "ae",
155+
'ö' => "oe",
156+
'ü' => "ue",
157+
'Ä' => "Ae",
158+
'Ö' => "Oe",
159+
'Ü' => "Ue",
160+
'ß' => "ss",
161+
// Norwegian/Danish/Swedish
162+
'æ' => "ae",
163+
'ø' => "oe",
164+
'å' => "aa",
165+
'Æ' => "Ae",
166+
'Ø' => "Oe",
167+
'Å' => "Aa",
168+
// Icelandic
169+
'þ' => "th",
170+
'Þ' => "Th",
171+
'ð' => "d",
172+
'Ð' => "D",
173+
_ => ReadOnlySpan<char>.Empty
174+
};
175+
}
176+
104177
public static string UrilizeAsGfm(string headingText)
105178
{
106179
return UrilizeAsGfm(headingText.AsSpan());
@@ -218,7 +291,8 @@ public static bool TryParseAutolink(ref StringSlice text, [NotNullWhen(true)] ou
218291
}
219292
state = 1;
220293
break;
221-
} else if (c == '@')
294+
}
295+
else if (c == '@')
222296
{
223297
if (state > 0)
224298
{
@@ -234,7 +308,7 @@ public static bool TryParseAutolink(ref StringSlice text, [NotNullWhen(true)] ou
234308
}
235309

236310
// append ':' or '@'
237-
builder.Append(c);
311+
builder.Append(c);
238312

239313
if (state < 0)
240314
{

0 commit comments

Comments
 (0)