11// SPDX-FileCopyrightText: 2019-2024 Connor McLaughlin <stenzek@gmail.com>
22// SPDX-License-Identifier: CC-BY-NC-ND-4.0
33
4- #include " common/string_util.h"
54#include " common/string_pool.h"
5+ #include " common/string_util.h"
66
77#include < gtest/gtest.h>
8+ #include < string_view>
89#include < tuple>
910
11+ using namespace std ::string_view_literals;
12+
1013TEST (StringUtil, Ellipsise)
1114{
1215 ASSERT_EQ (StringUtil::Ellipsise (" HelloWorld" , 6 , " ..." ), " Hel..." );
@@ -200,7 +203,7 @@ TEST(StringUtil, Strlcpy)
200203
201204 // Truncation test
202205 result = StringUtil::Strlcpy (buffer, " hello world" , sizeof (buffer));
203- ASSERT_EQ (result, 11u ); // Should return original string length
206+ ASSERT_EQ (result, 11u ); // Should return original string length
204207 ASSERT_STREQ (buffer, " hello wor" ); // Should be truncated and null-terminated
205208
206209 // Empty string
@@ -718,6 +721,79 @@ TEST(StringUtil, GetNextToken)
718721 ASSERT_EQ (caret, " d" );
719722}
720723
724+ TEST (StringUtil, GetUTF8CharacterCount)
725+ {
726+ EXPECT_EQ (StringUtil::GetUTF8CharacterCount (" " sv), 0u );
727+ EXPECT_EQ (StringUtil::GetUTF8CharacterCount (" Hello, world!" sv), 13u );
728+
729+ // COPYRIGHT SIGN U+00A9 -> 0xC2 0xA9
730+ EXPECT_EQ (StringUtil::GetUTF8CharacterCount (" \xC2\xA9 " sv), 1u );
731+
732+ // Truncated 2-byte sequence (only leading byte present)
733+ EXPECT_EQ (StringUtil::GetUTF8CharacterCount (" \xC2 " sv), 1u );
734+
735+ // EURO SIGN U+20AC -> 0xE2 0x82 0xAC
736+ EXPECT_EQ (StringUtil::GetUTF8CharacterCount (" \xE2\x82\xAC " sv), 1u );
737+
738+ // Truncated 3-byte sequence
739+ EXPECT_EQ (StringUtil::GetUTF8CharacterCount (" \xE2\x82 " sv), 1u );
740+
741+ // GRINNING FACE U+1F600 -> 0xF0 0x9F 0x98 0x80
742+ EXPECT_EQ (StringUtil::GetUTF8CharacterCount (" \xF0\x9F\x98\x80 " sv), 1u );
743+
744+ // Truncated 4-byte sequence
745+ EXPECT_EQ (StringUtil::GetUTF8CharacterCount (" \xF0\x9F\x98 " sv), 1u );
746+
747+ // "A" + EURO + GRINNING + "B"
748+ EXPECT_EQ (StringUtil::GetUTF8CharacterCount (" A"
749+ " \xE2\x82\xAC "
750+ " \xF0\x9F\x98\x80 "
751+ " B" sv),
752+ 4u );
753+
754+ // Three grinning faces in a row (3 * 4 bytes)
755+ EXPECT_EQ (StringUtil::GetUTF8CharacterCount (" \xF0\x9F\x98\x80 "
756+ " \xF0\x9F\x98\x80 "
757+ " \xF0\x9F\x98\x80 " sv),
758+ 3u );
759+
760+ // Continuation bytes (0x80 - 0xBF) appearing alone are invalid and should each count as one.
761+ EXPECT_EQ (StringUtil::GetUTF8CharacterCount (" \x80\x81\x82 " sv), 3u );
762+
763+ // Leading bytes that are outside allowed ranges (e.g., 0xF5..0xFF)
764+ EXPECT_EQ (StringUtil::GetUTF8CharacterCount (" \xF5\xF6\xFF " sv), 3u );
765+
766+ // 0xF4 allowed as 4-byte lead (e.g., U+10FFFF -> F4 8F BF BF)
767+ EXPECT_EQ (StringUtil::GetUTF8CharacterCount (" \xF4\x8F\xBF\xBF " sv), 1u );
768+
769+ // Mix: ASCII, valid 2-byte, invalid continuation, truncated 3-byte, valid 3-byte, valid 4-byte
770+ EXPECT_EQ (StringUtil::GetUTF8CharacterCount (" X"
771+ " \xC3\xA9 "
772+ " \x80 "
773+ " \xE2 "
774+ " \xE2\x82\xAC "
775+ " \xF0\x9F\x8D\x95 " sv),
776+ 6u );
777+
778+ // Inline characters (not hex escapes): 'a' (ASCII), 'é' (U+00E9), '€' (U+20AC), '😀' (U+1F600), 'z'
779+ EXPECT_EQ (StringUtil::GetUTF8CharacterCount (" aé€😀z" sv), 5u );
780+
781+ // Emoji-only example (two emoji characters inline)
782+ EXPECT_EQ (StringUtil::GetUTF8CharacterCount (" 😀😀" sv), 2u );
783+
784+ // "Hello ⣿ World 😀" but using standard euro sign U+20AC
785+ EXPECT_EQ (StringUtil::GetUTF8CharacterCount (" Hello € World 😀" sv), 15u );
786+
787+ // 'A' 'é' 'B' '€' '😀' 'C' -> total 6 codepoints
788+ EXPECT_EQ (StringUtil::GetUTF8CharacterCount (" AéB€😀C" sv), 6u );
789+
790+ // Inline 'é' then hex euro then inline emoji
791+ EXPECT_EQ (StringUtil::GetUTF8CharacterCount (" é"
792+ " \xE2\x82\xAC "
793+ " 😀" sv),
794+ 3u );
795+ }
796+
721797TEST (StringUtil, EncodeAndAppendUTF8)
722798{
723799 std::string s;
@@ -744,7 +820,7 @@ TEST(StringUtil, EncodeAndAppendUTF8)
744820 // Test invalid character (should encode replacement character)
745821 s.clear ();
746822 StringUtil::EncodeAndAppendUTF8 (s, 0x110000 ); // Invalid
747- ASSERT_EQ (s.size (), 3u ); // Replacement character is 3 bytes
823+ ASSERT_EQ (s.size (), 3u ); // Replacement character is 3 bytes
748824
749825 // Test buffer version
750826 u8 buffer[10 ] = {0 };
0 commit comments