Skip to content

Commit 754591f

Browse files
committed
StringUtil: Add GetUTF8CharacterCount()
1 parent 982035f commit 754591f

File tree

3 files changed

+108
-3
lines changed

3 files changed

+108
-3
lines changed

src/common-tests/string_tests.cpp

Lines changed: 79 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,15 @@
11
// SPDX-FileCopyrightText: 2019-2024 Connor McLaughlin <stenzek@gmail.com>
22
// SPDX-License-Identifier: CC-BY-NC-ND-4.0
33

4-
#include "common/string_util.h"
54
#include "common/string_pool.h"
5+
#include "common/string_util.h"
66

77
#include <gtest/gtest.h>
8+
#include <string_view>
89
#include <tuple>
910

11+
using namespace std::string_view_literals;
12+
1013
TEST(StringUtil, Ellipsise)
1114
{
1215
ASSERT_EQ(StringUtil::Ellipsise("HelloWorld", 6, "..."), "Hel...");
@@ -200,7 +203,7 @@ TEST(StringUtil, Strlcpy)
200203

201204
// Truncation test
202205
result = StringUtil::Strlcpy(buffer, "hello world", sizeof(buffer));
203-
ASSERT_EQ(result, 11u); // Should return original string length
206+
ASSERT_EQ(result, 11u); // Should return original string length
204207
ASSERT_STREQ(buffer, "hello wor"); // Should be truncated and null-terminated
205208

206209
// Empty string
@@ -718,6 +721,79 @@ TEST(StringUtil, GetNextToken)
718721
ASSERT_EQ(caret, "d");
719722
}
720723

724+
TEST(StringUtil, GetUTF8CharacterCount)
{
  // Table of inputs and the codepoint count expected for each. Multi-part literals are kept as
  // separate adjacent strings so that a hex escape never absorbs a following hex-digit character.
  struct Case
  {
    std::string_view input;
    size_t expected;
    const char* description;
  };

  const Case cases[] = {
    {""sv, 0u, "empty string"},
    {"Hello, world!"sv, 13u, "plain ASCII"},

    // COPYRIGHT SIGN U+00A9 -> 0xC2 0xA9
    {"\xC2\xA9"sv, 1u, "valid 2-byte sequence"},
    {"\xC2"sv, 1u, "truncated 2-byte sequence (lead byte only)"},

    // EURO SIGN U+20AC -> 0xE2 0x82 0xAC
    {"\xE2\x82\xAC"sv, 1u, "valid 3-byte sequence"},
    {"\xE2\x82"sv, 1u, "truncated 3-byte sequence"},

    // GRINNING FACE U+1F600 -> 0xF0 0x9F 0x98 0x80
    {"\xF0\x9F\x98\x80"sv, 1u, "valid 4-byte sequence"},
    {"\xF0\x9F\x98"sv, 1u, "truncated 4-byte sequence"},

    {"A"
     "\xE2\x82\xAC"
     "\xF0\x9F\x98\x80"
     "B"sv,
     4u, "'A' + euro + grinning face + 'B'"},

    {"\xF0\x9F\x98\x80"
     "\xF0\x9F\x98\x80"
     "\xF0\x9F\x98\x80"sv,
     3u, "three consecutive 4-byte sequences"},

    // Continuation bytes (0x80 - 0xBF) with no lead byte are invalid; each counts as one.
    {"\x80\x81\x82"sv, 3u, "stray continuation bytes"},

    // 0xF5..0xFF can never start a sequence; each counts as one.
    {"\xF5\xF6\xFF"sv, 3u, "lead bytes outside the allowed range"},

    // 0xF4 is the highest permitted 4-byte lead (U+10FFFF -> F4 8F BF BF).
    {"\xF4\x8F\xBF\xBF"sv, 1u, "highest codepoint U+10FFFF"},

    {"X"
     "\xC3\xA9"
     "\x80"
     "\xE2"
     "\xE2\x82\xAC"
     "\xF0\x9F\x8D\x95"sv,
     6u, "ASCII, valid 2-byte, stray continuation, truncated 3-byte, valid 3-byte, valid 4-byte"},

    // The remaining cases use characters typed directly in the source rather than hex escapes.
    {"aé€😀z"sv, 5u, "inline 'a', U+00E9, U+20AC, U+1F600, 'z'"},
    {"😀😀"sv, 2u, "two inline emoji"},
    {"Hello € World 😀"sv, 15u, "ASCII words mixed with a euro sign and an emoji"},
    {"AéB€😀C"sv, 6u, "'A' U+00E9 'B' U+20AC U+1F600 'C'"},

    {"é"
     "\xE2\x82\xAC"
     "😀"sv,
     3u, "inline U+00E9, hex-escaped euro, inline emoji"},
  };

  for (const Case& tc : cases)
    EXPECT_EQ(StringUtil::GetUTF8CharacterCount(tc.input), tc.expected) << tc.description;
}
796+
721797
TEST(StringUtil, EncodeAndAppendUTF8)
722798
{
723799
std::string s;
@@ -744,7 +820,7 @@ TEST(StringUtil, EncodeAndAppendUTF8)
744820
// Test invalid character (should encode replacement character)
745821
s.clear();
746822
StringUtil::EncodeAndAppendUTF8(s, 0x110000); // Invalid
747-
ASSERT_EQ(s.size(), 3u); // Replacement character is 3 bytes
823+
ASSERT_EQ(s.size(), 3u); // Replacement character is 3 bytes
748824

749825
// Test buffer version
750826
u8 buffer[10] = {0};

src/common/string_util.cpp

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -463,6 +463,32 @@ bool StringUtil::ParseAssignmentString(const std::string_view str, std::string_v
463463
return true;
464464
}
465465

466+
size_t StringUtil::GetUTF8CharacterCount(const std::string_view str)
467+
{
468+
size_t count = 0;
469+
470+
const size_t len = str.length();
471+
for (size_t pos = 0; pos < len;)
472+
{
473+
const u8 c = str[pos];
474+
475+
if (c < 0x80) // ASCII
476+
pos += 1;
477+
else if ((c & 0xE0) == 0xC0) // 2-byte sequence
478+
pos += 2;
479+
else if ((c & 0xF0) == 0xE0) // 3-byte sequence
480+
pos += 3;
481+
else if ((c & 0xF8) == 0xF0 && c <= 0xF4) // 4-byte sequence (limited to 0xF4)
482+
pos += 4;
483+
else // Unknown/invalid leading byte: treat as one invalid byte (replacement), advance one.
484+
pos += 1;
485+
486+
++count;
487+
}
488+
489+
return count;
490+
}
491+
466492
void StringUtil::EncodeAndAppendUTF8(std::string& s, char32_t ch)
467493
{
468494
if (ch <= 0x7F) [[likely]]

src/common/string_util.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -458,6 +458,9 @@ ALWAYS_INLINE std::optional<std::string_view> GetNextToken(std::string_view& car
458458
/// Unicode replacement character.
459459
inline constexpr char32_t UNICODE_REPLACEMENT_CHARACTER = 0xFFFD;
460460

461+
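/// Returns the number of codepoints in a UTF-8 string. Stray continuation bytes, invalid lead
/// bytes, and a sequence cut short by the end of the string each count as one codepoint.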
462+
size_t GetUTF8CharacterCount(const std::string_view str);
463+
461464
/// Appends a UTF-16/UTF-32 codepoint to a UTF-8 string.
462465
void EncodeAndAppendUTF8(std::string& s, char32_t ch);
463466
size_t EncodeAndAppendUTF8(void* utf8, size_t pos, size_t size, char32_t ch);

0 commit comments

Comments
 (0)