Skip to content

Commit f43d988

Browse files
committed
Recognize supplementary characters
1 parent 8c01cf0 commit f43d988

File tree

7 files changed

+232
-22
lines changed

7 files changed

+232
-22
lines changed

src/Markdig.Tests/TestEmphasisPlus.cs

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,19 @@ public void NormalStrongNormal()
2222
TestParser.TestSpec("normal ***Strong emphasis*** normal", "<p>normal <em><strong>Strong emphasis</strong></em> normal</p>", "");
2323
}
2424

25+
[Test]
public void SupplementaryPunctuation()
{
    // Six paragraphs, each placing a '*' delimiter run next to '∇', '𝜵' or '𐬼'
    // (the latter two are encoded as surrogate pairs in UTF-16).
    const string markdown = "a*a∇*a\n\na*∇a*a\n\na*a𝜵*a\n\na*𝜵a*a\n\na*𐬼a*a\n\na*a𐬼*a";

    // Every paragraph is expected to come back with its asterisks intact, i.e. no emphasis is produced.
    const string expectedHtml = "<p>a*a∇*a</p>\n<p>a*∇a*a</p>\n<p>a*a𝜵*a</p>\n<p>a*𝜵a*a</p>\n<p>a*𐬼a*a</p>\n<p>a*a𐬼*a</p>";

    TestParser.TestSpec(markdown, expectedHtml, "");
}
30+
31+
[Test]
public void RecognizeSupplementaryChars()
{
    // Strong-emphasis runs whose neighbours are emoji and CJK characters, several of which
    // are encoded as surrogate pairs in UTF-16.
    const string markdown = "🌶️**𰻞**🍜**𰻞**🌶️**麺**🍜";

    // Each '**…**' span is expected to become a <strong> element.
    const string expectedHtml = "<p>🌶️<strong>𰻞</strong>🍜<strong>𰻞</strong>🌶️<strong>麺</strong>🍜</p>";

    TestParser.TestSpec(markdown, expectedHtml, "");
}
36+
37+
2538
[Test]
2639
public void OpenEmphasisHasConvenientContentStringSlice()
2740
{

src/Markdig.Tests/TestSmartyPants.cs

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,4 +31,14 @@ public void MappingCanBeReconfigured_HandlesRemovedMappings()
3131

3232
TestParser.TestSpec("<<test>>", "<p>&laquo;test&raquo;</p>", pipeline);
3333
}
34+
35+
[Test]
public void RecognizesSupplementaryCharacters()
{
    // Straight double quotes surrounded by characters that are encoded as surrogate pairs in UTF-16.
    const string markdown = "\"𝜵\"𠮷\"𝜵\"𩸽\"";

    // Expected mix of opening (&ldquo;) and closing (&rdquo;) quotes for the input above.
    const string expectedHtml = "<p>&ldquo;𝜵&ldquo;𠮷&rdquo;𝜵&ldquo;𩸽&rdquo;</p>";

    var pipeline = new MarkdownPipelineBuilder().UseSmartyPants().Build();

    TestParser.TestSpec(markdown, expectedHtml, pipeline);
}
3444
}

src/Markdig/Extensions/SmartyPants/SmartyPantsInlineParser.cs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ public override bool Match(InlineProcessor processor, ref StringSlice slice)
3636
// -- – &ndash; 'ndash'
3737
// --- — &mdash; 'mdash'
3838

39-
var pc = slice.PeekCharExtra(-1);
39+
var pc = slice.PeekRuneExtra(-1);
4040
var c = slice.CurrentChar;
4141
var openingChar = c;
4242

@@ -93,9 +93,9 @@ public override bool Match(InlineProcessor processor, ref StringSlice slice)
9393
}
9494

9595
// Skip char
96-
c = slice.NextChar();
96+
var next = slice.NextRune();
9797

98-
CharHelper.CheckOpenCloseDelimiter(pc, c, false, out bool canOpen, out bool canClose);
98+
CharHelper.CheckOpenCloseDelimiter(pc, next, false, out bool canOpen, out bool canClose);
9999

100100
bool postProcess = false;
101101

src/Markdig/Helpers/CharHelper.cs

Lines changed: 59 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,12 @@
11
// Copyright (c) Alexandre Mutel. All rights reserved.
2-
// This file is licensed under the BSD-Clause 2 license.
2+
// This file is licensed under the BSD-Clause 2 license.
33
// See the license.txt file in the project root for more information.
44

55
using System.Buffers;
66
using System.Diagnostics;
77
using System.Globalization;
88
using System.Runtime.CompilerServices;
9+
using System.Text;
910

1011
namespace Markdig.Helpers;
1112

@@ -69,10 +70,10 @@ public static class CharHelper
6970
private static readonly SearchValues<char> s_escapableSymbolChars = SearchValues.Create("!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~•");
7071

7172
[MethodImpl(MethodImplOptions.AggressiveInlining)]
72-
private static bool IsPunctuationException(char c) =>
73-
c is '−' or '-' or '†' or '‡';
73+
private static bool IsPunctuationException(Rune c) =>
74+
c.IsBmp && (char)c.Value is '−' or '-' or '†' or '‡';
7475

75-
public static void CheckOpenCloseDelimiter(char pc, char c, bool enableWithinWord, out bool canOpen, out bool canClose)
76+
public static void CheckOpenCloseDelimiter(Rune pc, Rune c, bool enableWithinWord, out bool canOpen, out bool canClose)
7677
{
7778
pc.CheckUnicodeCategory(out bool prevIsWhiteSpace, out bool prevIsPunctuation);
7879
c.CheckUnicodeCategory(out bool nextIsWhiteSpace, out bool nextIsPunctuation);
@@ -100,13 +101,13 @@ public static void CheckOpenCloseDelimiter(char pc, char c, bool enableWithinWor
100101
if (!enableWithinWord)
101102
{
102103
var temp = canOpen;
103-
// A single _ character can open emphasis iff it is part of a left-flanking delimiter run and either
104-
// (a) not part of a right-flanking delimiter run or
104+
// A single _ character can open emphasis iff it is part of a left-flanking delimiter run and either
105+
// (a) not part of a right-flanking delimiter run or
105106
// (b) part of a right-flanking delimiter run preceded by punctuation.
106107
canOpen = canOpen && (!canClose || prevIsPunctuation);
107108

108109
// A single _ character can close emphasis iff it is part of a right-flanking delimiter run and either
109-
// (a) not part of a left-flanking delimiter run or
110+
// (a) not part of a left-flanking delimiter run or
110111
// (b) part of a left-flanking delimiter run followed by punctuation.
111112
canClose = canClose && (!temp || nextIsPunctuation);
112113
}
@@ -199,6 +200,9 @@ public static bool IsWhitespace(this char c)
199200
return IsWhitespaceRare(c);
200201
}
201202

203+
/// <summary>
/// Determines whether the specified rune is a whitespace character.
/// </summary>
/// <param name="r">The rune to test.</param>
/// <returns>
/// <c>false</c> for any rune outside the Basic Multilingual Plane; otherwise the result of the
/// <see cref="char"/> overload of <c>IsWhitespace</c> for the rune's single UTF-16 code unit.
/// </returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static bool IsWhitespace(this Rune r)
{
    // Supplementary code points are never reported as whitespace by this helper.
    if (!r.IsBmp)
    {
        return false;
    }

    return IsWhitespace((char)r.Value);
}
205+
202206
[MethodImpl(MethodImplOptions.AggressiveInlining)]
203207
public static bool IsWhiteSpaceOrZero(this char c)
204208
{
@@ -263,6 +267,52 @@ public static void CheckUnicodeCategory(this char c, out bool space, out bool pu
263267
}
264268
}
265269

270+
#if !(NETSTANDARD2_1_OR_GREATER || NETCOREAPP2_1_OR_GREATER)
// Lazily created delegate for a non-public static method named "GetUnicodeCategory" that is looked up
// on System.Char via reflection. The lazy value is null when no such method is found.
// The field is readonly: it is assigned once here and never replaced.
private static readonly Lazy<Func<int, UnicodeCategory>?> GetUnicodeCategoryReflection =
    new(() => (Func<int, UnicodeCategory>?)typeof(char).GetMethod("GetUnicodeCategory", System.Reflection.BindingFlags.NonPublic | System.Reflection.BindingFlags.Static)?.CreateDelegate(
        typeof(Func<int, UnicodeCategory>)));
#endif

/// <summary>
/// Checks whether a rune is a space or a punctuation.
/// </summary>
/// <param name="c">The rune to classify.</param>
/// <param name="space">
/// Set to <c>true</c> when the rune is whitespace (per the <see cref="Rune"/> overload of <c>IsWhitespace</c>)
/// or is U+0000; otherwise <c>false</c>.
/// </param>
/// <param name="punctuation">
/// Set to <c>true</c> when the rune is ASCII punctuation or zero (for values up to 127), or when its Unicode
/// category is included in <c>CommonMarkPunctuationCategoryMask</c> (for values above 127); otherwise <c>false</c>.
/// </param>
public static void CheckUnicodeCategory(this Rune c, out bool space, out bool punctuation)
{
    if (IsWhitespace(c))
    {
        space = true;
        punctuation = false;
    }
    else if (c.Value <= 127)
    {
        // A value in the ASCII range always fits in a single char, so no BMP check is needed before the cast.
        space = c.Value == 0;
        punctuation = IsAsciiPunctuationOrZero((char)c.Value);
    }
    else
    {
        space = false;
        punctuation = (CommonMarkPunctuationCategoryMask & (1 <<
#if NETCOREAPP2_1_OR_GREATER || NETSTANDARD2_1_OR_GREATER
            (int)CharUnicodeInfo.GetUnicodeCategory(c.Value)
#else
            (int)GetUnicodeCategoryFallback(c)
#endif
        )) != 0;
    }

#if !(NETSTANDARD2_1_OR_GREATER || NETCOREAPP2_1_OR_GREATER)
    // Used on targets where the code above does not call CharUnicodeInfo.GetUnicodeCategory(int).
    static UnicodeCategory GetUnicodeCategoryFallback(Rune c)
    {
        // BMP runes are a single char, so the char overload can be used directly.
        if (c.IsBmp)
        {
            return CharUnicodeInfo.GetUnicodeCategory((char)c.Value);
        }

        // Prefer the reflected int-based lookup when it could be resolved.
        if (GetUnicodeCategoryReflection.Value is { } getCategory)
        {
            return getCategory(c.Value);
        }

        // Last resort: materialize the rune as a string and query the category at index 0.
        return CharUnicodeInfo.GetUnicodeCategory(c.ToString(), 0);
    }
#endif
}
315+
266316
[MethodImpl(MethodImplOptions.AggressiveInlining)]
267317
internal static bool IsSpaceOrPunctuationForGFMAutoLink(char c)
268318
{
@@ -309,15 +359,15 @@ public static bool IsZero(this char c)
309359
[MethodImpl(MethodImplOptions.AggressiveInlining)]
310360
public static bool IsSpace(this char c)
311361
{
312-
// 2.1 Characters and lines
362+
// 2.1 Characters and lines
313363
// A space is U+0020.
314364
return c == ' ';
315365
}
316366

317367
[MethodImpl(MethodImplOptions.AggressiveInlining)]
318368
public static bool IsTab(this char c)
319369
{
320-
// 2.1 Characters and lines
370+
// 2.1 Characters and lines
321371
// A tab is U+0009.
322372
return c == '\t';
323373
}

src/Markdig/Helpers/StringSlice.cs

Lines changed: 134 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,12 @@
11
// Copyright (c) Alexandre Mutel. All rights reserved.
2-
// This file is licensed under the BSD-Clause 2 license.
2+
// This file is licensed under the BSD-Clause 2 license.
33
// See the license.txt file in the project root for more information.
44

55
#nullable disable
66

77
using System.Runtime.CompilerServices;
88
using System.Runtime.InteropServices;
9+
using System.Text;
910

1011
namespace Markdig.Helpers;
1112

@@ -114,7 +115,7 @@ internal StringSlice(string text, int start, int end, NewLine newLine, bool dumm
114115
public NewLine NewLine;
115116

116117
/// <summary>
117-
/// Gets the current character.
118+
/// Gets the current character.
118119
/// </summary>
119120
public readonly char CurrentChar
120121
{
@@ -125,6 +126,31 @@ public readonly char CurrentChar
125126
}
126127
}
127128

129+
/// <summary>
/// Gets the rune (Unicode scalar value) located at <see cref="Start"/>. Unlike <see cref="CurrentChar"/>,
/// this recognizes supplementary code points that are encoded as a surrogate pair.
/// </summary>
/// <remarks>
/// Returns the default rune when <see cref="Start"/> is past <see cref="End"/> or when the code unit at
/// <see cref="Start"/> is a surrogate without a matching partner. When <see cref="Start"/> is on a low
/// surrogate, the pair is completed with the code unit immediately before it in <see cref="Text"/>.
/// </remarks>
public readonly Rune CurrentRune
{
    get
    {
        int position = Start;
        if (position > End)
        {
            return default;
        }

        char unit = Text[position];
        if (!char.IsSurrogate(unit))
        {
            return new Rune(unit);
        }

        if (char.IsHighSurrogate(unit))
        {
            // The low surrogate must still lie within the slice.
            if (position < End)
            {
                char trail = Text[position + 1];
                if (char.IsLowSurrogate(trail))
                {
                    return new Rune(unit, trail);
                }
            }

            return default;
        }

        // 'unit' is a low surrogate: look one code unit back for its high surrogate.
        if (position >= 1)
        {
            char lead = Text[position - 1];
            if (char.IsHighSurrogate(lead))
            {
                return new Rune(lead, unit);
            }
        }

        return default;
    }
}
153+
128154
/// <summary>
129155
/// Gets a value indicating whether this instance is empty.
130156
/// </summary>
@@ -145,6 +171,35 @@ public readonly char this[int index]
145171
get => Text[index];
146172
}
147173

174+
/// <summary>
/// Gets the Unicode scalar value (rune) at the specified index.
/// Recognizes supplementary code points that cannot be covered by a single character.
/// </summary>
/// <param name="index">
/// The index of the code unit to inspect. It is used to index <see cref="Text"/> directly, in the same
/// coordinates as <see cref="Start"/> and <see cref="End"/>.
/// </param>
/// <returns>
/// The rune at the specified index, or the default value (refers to <c>'\0'</c>) if the index is outside
/// <see cref="Text"/> or the rune cannot be determined (a surrogate without a matching partner inside the slice).
/// </returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public readonly Rune RuneAt(int index)
{
    var text = Text;

    // Honor the documented contract: an out-of-range index yields default instead of throwing.
    if ((uint)index >= (uint)text.Length)
    {
        return default;
    }

    var current = text[index];
    if (!char.IsSurrogate(current))
    {
        return new Rune(current);
    }

    if (char.IsHighSurrogate(current))
    {
        // A high surrogate can only be completed by the following code unit. It must never fall through
        // to the low-surrogate handling below, where it would be passed as the second half of a pair.
        if (index + 1 <= End)
        {
            var low = text[index + 1];
            if (char.IsLowSurrogate(low))
            {
                return new Rune(current, low);
            }
        }

        return default;
    }

    // 'current' is a low surrogate: its high surrogate must be the preceding code unit within the slice.
    if (index >= Start + 1)
    {
        var high = text[index - 1];
        if (char.IsHighSurrogate(high))
        {
            return new Rune(high, current);
        }
    }

    return default;
}
202+
148203

149204
/// <summary>
150205
/// Goes to the next character, incrementing the <see cref="Start" /> position.
@@ -166,6 +221,36 @@ public char NextChar()
166221
return Text[start];
167222
}
168223

224+
/// <summary>
/// Advances <see cref="Start"/> by one code unit and reads the rune found there.
/// </summary>
/// <returns>
/// The rune at the new position. Returns default when the slice is exhausted (in which case
/// <see cref="Start"/> is set to <c>End + 1</c>) or when the code unit at the new position is a surrogate
/// that does not begin a valid pair within the slice.
/// </returns>
/// <remarks>
/// When a surrogate pair is read, <see cref="Start"/> is advanced a second time and is left on the
/// low surrogate of that pair.
/// </remarks>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public Rune NextRune()
{
    int position = Start;
    if (position >= End)
    {
        Start = End + 1;
        return default;
    }

    position++;
    Start = position;

    char lead = Text[position];
    if (!char.IsSurrogate(lead))
    {
        return new Rune(lead);
    }

    int trailPosition = position + 1;
    if (char.IsHighSurrogate(lead) && trailPosition <= End)
    {
        char trail = Text[trailPosition];
        if (char.IsLowSurrogate(trail))
        {
            // Consume the second half of the pair as well.
            Start = trailPosition;
            return new Rune(lead, trail);
        }
    }

    return default;
}
253+
169254
/// <summary>
170255
/// Goes to the next character, incrementing the <see cref="Start" /> position.
171256
/// </summary>
@@ -244,6 +329,53 @@ public readonly char PeekCharExtra(int offset)
244329
return (uint)index < (uint)text.Length ? text[index] : '\0';
245330
}
246331

332+
/// <summary>
/// Peeks a rune at the specified offset from the current beginning of the slice
/// without using the range <see cref="Start"/> or <see cref="End"/>, returns default if outside the <see cref="Text"/>.
/// Recognizes supplementary code points that cannot be covered by a single character.
/// </summary>
/// <param name="offset">The offset, in UTF-16 code units, added to <see cref="Start"/>.</param>
/// <returns>
/// The rune at the specified offset. If the code unit at that position is a high surrogate, it is combined
/// with the following code unit; if it is a low surrogate, it is combined with the preceding code unit.
/// Returns default if the position is outside <see cref="Text"/> or the surrogate has no matching partner.
/// </returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public readonly Rune PeekRuneExtra(int offset)
{
    var index = Start + offset;
    var text = Text;
    if ((uint)index >= (uint)text.Length)
    {
        return default;
    }

    var current = text[index];
    if (!char.IsSurrogate(current))
    {
        return new Rune(current);
    }

    if (char.IsHighSurrogate(current))
    {
        // High surrogate: the pair continues with the NEXT code unit.
        if (index + 1 >= text.Length)
        {
            return default;
        }

        var lowSurrogate = text[index + 1];
        if (!char.IsLowSurrogate(lowSurrogate))
        {
            return default;
        }

        return new Rune(current, lowSurrogate);
    }

    // Low surrogate (the usual case for a negative offset): the pair started at the PREVIOUS code unit.
    // index == 1 is valid here, since the high surrogate may sit at position 0.
    if (index < 1)
    {
        return default;
    }

    var highSurrogate = text[index - 1];
    if (!char.IsHighSurrogate(highSurrogate))
    {
        return default;
    }

    return new Rune(highSurrogate, current);
}
378+
247379
/// <summary>
248380
/// Matches the specified text.
249381
/// </summary>

src/Markdig/Markdig.targets

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,10 @@
2828
<PackageReference Include="System.Memory" Version="4.6.0" />
2929
</ItemGroup>
3030

31+
<ItemGroup Condition=" '$(TargetFramework)' == 'net462' OR '$(TargetFramework)' == 'netstandard2.0' OR '$(TargetFramework)' == 'netstandard2.1'">
32+
<PackageReference Include="Shim.System.Text.Rune" Version="6.0.2" />
33+
</ItemGroup>
34+
3135
<ItemGroup>
3236
<None Include="../../img/markdig.png" Pack="true" PackagePath="" />
3337
<None Include="../../readme.md" Pack="true" PackagePath="/"/>

0 commit comments

Comments
 (0)