Skip to content

ICU-23152 Unicode helper APIs #3539

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Jul 22, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions icu4c/source/common/common.vcxproj.filters
Original file line number Diff line number Diff line change
Expand Up @@ -1270,6 +1270,9 @@
<CustomBuild Include="unicode\utfiterator.h">
<Filter>strings</Filter>
</CustomBuild>
<CustomBuild Include="unicode\utfstring.h">
<Filter>strings</Filter>
</CustomBuild>
<CustomBuild Include="unicode\bytestrie.h">
<Filter>collections</Filter>
</CustomBuild>
Expand Down
13 changes: 11 additions & 2 deletions icu4c/source/common/unicode/bytestream.h
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,8 @@

#if U_SHOW_CPLUSPLUS_API

#include <type_traits>

#include "unicode/uobject.h"
#include "unicode/std_string.h"

Expand Down Expand Up @@ -260,11 +262,12 @@ class U_COMMON_API CheckedArrayByteSink : public ByteSink {

/**
* Implementation of ByteSink that writes to a "string".
* The StringClass is usually instantiated with a std::string.
* The StringClass is usually instantiated with a std::string or a std::u8string.
* @stable ICU 4.2
*/
template<typename StringClass>
class StringByteSink : public ByteSink {
using Unit = typename StringClass::value_type;
public:
/**
* Constructs a ByteSink that will append bytes to the dest string.
Expand All @@ -291,7 +294,13 @@ class StringByteSink : public ByteSink {
* @param n the number of bytes; must be non-negative
* @stable ICU 4.2
*/
virtual void Append(const char* data, int32_t n) override { dest_->append(data, n); }
virtual void Append(const char* data, int32_t n) override {
    if constexpr (std::is_same_v<Unit, char>) {
        // Plain char strings (e.g. std::string) take the bytes as they are.
        dest_->append(data, n);
    } else {
        // Other code unit types (e.g. char8_t in std::u8string) receive the same
        // bytes through a pointer cast. That is only meaningful when one Unit
        // occupies exactly one byte, so reject wider unit types at compile time
        // instead of silently reading n multi-byte units from a byte buffer.
        static_assert(sizeof(Unit) == sizeof(char),
                      "StringByteSink requires a string class with single-byte code units");
        dest_->append(reinterpret_cast<const Unit*>(data), n);
    }
}
private:
StringClass* dest_;

Expand Down
2 changes: 1 addition & 1 deletion icu4c/source/common/unicode/uchar.h
Original file line number Diff line number Diff line change
Expand Up @@ -2918,7 +2918,7 @@ u_stringHasBinaryProperty(const UChar *s, int32_t length, UProperty which);
* @return the property as a set
* @see UProperty
* @see u_hasBinaryProperty
* @see Unicode::fromUSet
* @see UnicodeSet::fromUSet
* @stable ICU 63
*/
U_CAPI const USet * U_EXPORT2
Expand Down
70 changes: 69 additions & 1 deletion icu4c/source/common/unicode/unistr.h
Original file line number Diff line number Diff line change
Expand Up @@ -215,6 +215,10 @@ class UnicodeStringAppendable; // unicode/appendable.h
*
* The UnicodeString equivalent of std::string’s clear() is remove().
*
* Starting with ICU 78, a UnicodeString is a C++ "range" of char16_t code units.
* utfStringCodePoints() and unsafeUTFStringCodePoints() can be used to iterate over
* the code points.
*
* A UnicodeString may "alias" an external array of characters
* (that is, point to it, rather than own the array)
* whose lifetime must then at least match the lifetime of the aliasing object.
Expand Down Expand Up @@ -289,12 +293,17 @@ class UnicodeStringAppendable; // unicode/appendable.h
* [User Guide Strings chapter](https://unicode-org.github.io/icu/userguide/strings#maximizing-performance-with-the-unicodestring-storage-model).
*
* @see utf.h
* @see utfiterator.h
* @see utfStringCodePoints
* @see unsafeUTFStringCodePoints
* @see CharacterIterator
* @stable ICU 2.0
*/
class U_COMMON_API UnicodeString : public Replaceable
{
public:
/** C++ boilerplate @internal */
using value_type = char16_t;

/**
* Constant to be used in the UnicodeString(char *, int32_t, EInvariant) constructor
Expand Down Expand Up @@ -1767,7 +1776,8 @@ class U_COMMON_API UnicodeString : public Replaceable
* Unpaired surrogates are replaced with U+FFFD.
* Calls toUTF8().
*
* @param result A standard string (or a compatible object)
* @tparam StringClass A std::string or a std::u8string (or a compatible type)
* @param result A std::string or a std::u8string (or a compatible object)
* to which the UTF-8 version of the string is appended.
* @return The string object.
* @stable ICU 4.2
Expand All @@ -1780,6 +1790,27 @@ class U_COMMON_API UnicodeString : public Replaceable
return result;
}

#ifndef U_HIDE_DRAFT_API
/**
* Convert the UnicodeString to a UTF-8 string.
* Unpaired surrogates are replaced with U+FFFD.
* Calls toUTF8().
*
* @tparam StringClass A std::string or a std::u8string (or a compatible type)
* @return A std::string or a std::u8string (or a compatible object)
* with the UTF-8 version of the string.
* @draft ICU 78
* @see toUTF8
*/
template<typename StringClass>
StringClass toUTF8String() const {
StringClass result;
StringByteSink<StringClass> sbs(&result, length());
toUTF8(sbs);
return result;
}
#endif // U_HIDE_DRAFT_API

/**
* Convert the UnicodeString to UTF-32.
* Unpaired surrogates are replaced with U+FFFD.
Expand Down Expand Up @@ -1892,6 +1923,33 @@ class U_COMMON_API UnicodeString : public Replaceable
*/
inline UBool isBogus() const;

#ifndef U_HIDE_DRAFT_API
/**
 * Start of the code unit sequence, obtained from a std::u16string_view of this string.
 *
 * @return an iterator to the first code unit in this string.
 *     The iterator may be a pointer or a contiguous-iterator object.
 * @draft ICU 78
 */
auto begin() const {
    const std::u16string_view units(*this);
    return units.begin();
}
/**
 * End of the code unit sequence, obtained from a std::u16string_view of this string.
 *
 * @return an iterator to just past the last code unit in this string.
 *     The iterator may be a pointer or a contiguous-iterator object.
 * @draft ICU 78
 */
auto end() const {
    const std::u16string_view units(*this);
    return units.end();
}
/**
 * Start of the reversed code unit sequence,
 * obtained from a std::u16string_view of this string.
 *
 * @return a reverse iterator to the last code unit in this string.
 *     This is the reverse iterator type of std::u16string_view.
 * @draft ICU 78
 */
auto rbegin() const {
    const std::u16string_view units(*this);
    return units.rbegin();
}
/**
 * End of the reversed code unit sequence,
 * obtained from a std::u16string_view of this string.
 *
 * @return a reverse iterator to just before the first code unit in this string.
 *     This is the reverse iterator type of std::u16string_view.
 * @draft ICU 78
 */
auto rend() const {
    const std::u16string_view units(*this);
    return units.rend();
}
#endif // U_HIDE_DRAFT_API

//========================================
// Write operations
//========================================
Expand Down Expand Up @@ -2318,6 +2376,16 @@ class U_COMMON_API UnicodeString : public Replaceable
*/
UnicodeString& append(UChar32 srcChar);

#ifndef U_HIDE_DRAFT_API
/**
 * Appends the single code unit `c` to this UnicodeString.
 * Behaves like append(c), but returns nothing instead of *this.
 *
 * @param c the code unit to append
 * @draft ICU 78
 */
inline void push_back(char16_t c) {
    append(c);
}
#endif // U_HIDE_DRAFT_API

/* Insert operations */

Expand Down
33 changes: 32 additions & 1 deletion icu4c/source/common/unicode/utf.h
Original file line number Diff line number Diff line change
Expand Up @@ -121,8 +121,39 @@

/* single-code point definitions -------------------------------------------- */

#ifndef U_HIDE_DRAFT_API

/**
 * Tests whether c lies in the Unicode code point range U+0000..U+10FFFF.
 * The value is compared as an unsigned 32-bit integer, so negative inputs yield false.
 * https://www.unicode.org/glossary/#code_point
 *
 * @param c 32-bit code point
 * @return true or false
 * @draft ICU 78
 * @see AllCodePoints
 * @see U_IS_SCALAR_VALUE
 */
#define U_IS_CODE_POINT(c) ((uint32_t)(c)<0x110000)

/**
 * Tests whether c is a Unicode scalar value: a code point that is not a surrogate,
 * that is, U+0000..U+D7FF or U+E000..U+10FFFF.
 * Only scalar values can be represented in well-formed UTF-8/16/32.
 * c might be evaluated multiple times.
 * https://www.unicode.org/glossary/#unicode_scalar_value
 *
 * @param c 32-bit code point
 * @return true or false
 * @draft ICU 78
 * @see AllScalarValues
 * @see U_IS_CODE_POINT
 */
#define U_IS_SCALAR_VALUE(c) \
    ((uint32_t)(c)<0xd800 || \
     ((c)>=0xe000 && (c)<=0x10ffff))

#endif // U_HIDE_DRAFT_API

/**
* Is this code point a Unicode noncharacter?
* https://www.unicode.org/glossary/#noncharacter
*
* @param c 32-bit code point
* @return true or false
* @stable ICU 2.4
Expand Down Expand Up @@ -150,7 +181,7 @@
*/
#define U_IS_UNICODE_CHAR(c) \
((uint32_t)(c)<0xd800 || \
(0xdfff<(c) && (c)<=0x10ffff && !U_IS_UNICODE_NONCHAR(c)))
(0xe000<=(c) && (c)<=0x10ffff && !U_IS_UNICODE_NONCHAR(c)))

/**
* Is this code point a BMP code point (U+0000..U+ffff)?
Expand Down
28 changes: 27 additions & 1 deletion icu4c/source/common/unicode/utf8.h
Original file line number Diff line number Diff line change
Expand Up @@ -170,7 +170,7 @@ utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i);
* @return true or false
* @stable ICU 2.4
*/
#define U8_IS_SINGLE(c) (((c)&0x80)==0)
#define U8_IS_SINGLE(c) ((int8_t)(c)>=0)

/**
* Is this code unit (byte) a UTF-8 lead byte? (0xC2..0xF4)
Expand Down Expand Up @@ -214,6 +214,32 @@ utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i);
*/
#define U8_MAX_LENGTH 4

#ifndef U_HIDE_DRAFT_API

/**
 * Computes the length of a well-formed UTF-8 byte sequence from its lead byte:
 * one for the lead byte itself plus U8_COUNT_TRAIL_BYTES(leadByte).
 * Returns 1 for 0..0xc1 as well as for 0xf5..0xff.
 * leadByte might be evaluated multiple times.
 *
 * @param leadByte The first byte of a UTF-8 sequence. Must be 0..0xff.
 * @return 1..4
 * @draft ICU 78
 */
#define U8_LENGTH_FROM_LEAD_BYTE(leadByte) (1 + U8_COUNT_TRAIL_BYTES(leadByte))

/**
 * Computes the length of a well-formed UTF-8 byte sequence from its lead byte:
 * one for the lead byte itself plus U8_COUNT_TRAIL_BYTES_UNSAFE(leadByte).
 * Returns 1 for 0..0xc1. Undefined for 0xf5..0xff.
 * leadByte might be evaluated multiple times.
 *
 * @param leadByte The first byte of a UTF-8 sequence. Must be 0..0xff.
 * @return 1..4
 * @draft ICU 78
 */
#define U8_LENGTH_FROM_LEAD_BYTE_UNSAFE(leadByte) (1 + U8_COUNT_TRAIL_BYTES_UNSAFE(leadByte))

#endif // U_HIDE_DRAFT_API

/**
* Get a code point from a string at a random-access offset,
* without changing the offset.
Expand Down
104 changes: 104 additions & 0 deletions icu4c/source/common/unicode/utfiterator.h
Original file line number Diff line number Diff line change
Expand Up @@ -232,8 +232,112 @@ template<typename Range>
constexpr bool range = range_type<Range>::value;

#endif

/** @internal */
template<typename CP32, bool skipSurrogates>
class CodePointsIterator {
    static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
public:
    /** C++ iterator boilerplate @internal */
    using value_type = CP32;
    /** C++ iterator boilerplate @internal */
    using reference = value_type;
    /** C++ iterator boilerplate @internal */
    using pointer = CP32 *;
    /** C++ iterator boilerplate @internal */
    using difference_type = int32_t;
    /** C++ iterator boilerplate @internal */
    using iterator_category = std::forward_iterator_tag;

    /**
     * Default constructor; the iterator starts at code point 0.
     * Present because forward iterators must be default-constructible.
     * @internal
     */
    inline CodePointsIterator() : c_(0) {}
    /**
     * Constructs an iterator positioned at code point c.
     * @internal
     */
    inline CodePointsIterator(CP32 c) : c_(c) {}
    /** Two iterators are equal when they are at the same code point. @internal */
    inline bool operator==(const CodePointsIterator &other) const { return c_ == other.c_; }
    /** @internal */
    inline bool operator!=(const CodePointsIterator &other) const { return !operator==(other); }
    /** Returns the current code point by value. @internal */
    inline CP32 operator*() const { return c_; }
    /**
     * Pre-increment: moves to the next code point.
     * When skipSurrogates is true, stepping onto U+D800 jumps to U+E000,
     * so that the surrogate range is never returned.
     * @internal
     */
    inline CodePointsIterator &operator++() {
        ++c_;
        // Compile-time selection: the surrogate check exists only in the
        // scalar-values instantiation.
        if constexpr (skipSurrogates) {
            if (c_ == 0xd800) {
                c_ = 0xe000;
            }
        }
        return *this;
    }
    /** Post-increment: advances and returns the previous position. @internal */
    inline CodePointsIterator operator++(int) {
        CodePointsIterator result(*this);
        ++(*this);
        return result;
    }

private:
    CP32 c_;
};

} // namespace prv

/**
* A C++ "range" over all Unicode code points U+0000..U+10FFFF.
* https://www.unicode.org/glossary/#code_point
*
* Intended for test and builder code.
*
* @tparam CP32 Code point type: UChar32 (=int32_t) or char32_t or uint32_t
* @draft ICU 78
* @see U_IS_CODE_POINT
*/
template<typename CP32>
class AllCodePoints {
static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
public:
/** Constructor. @draft ICU 78 */
AllCodePoints() {}
/**
* @return an iterator over all Unicode code points.
* The iterator returns CP32 integers.
* @draft ICU 78
*/
auto begin() const { return prv::CodePointsIterator<CP32, false>(0); }
/**
* @return an exclusive-end iterator over all Unicode code points.
* @draft ICU 78
*/
auto end() const { return prv::CodePointsIterator<CP32, false>(0x110000); }
};

/**
* A C++ "range" over all Unicode scalar values U+0000..U+D7FF & U+E000..U+10FFFF.
* That is, all code points except surrogates.
* Only scalar values can be represented in well-formed UTF-8/16/32.
* https://www.unicode.org/glossary/#unicode_scalar_value
*
* Intended for test and builder code.
*
* @tparam CP32 Code point type: UChar32 (=int32_t) or char32_t or uint32_t
* @draft ICU 78
* @see U_IS_SCALAR_VALUE
*/
template<typename CP32>
class AllScalarValues {
static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
public:
/** Constructor. @draft ICU 78 */
AllScalarValues() {}
/**
* @return an iterator over all Unicode scalar values.
* The iterator returns CP32 integers.
* @draft ICU 78
*/
auto begin() const { return prv::CodePointsIterator<CP32, true>(0); }
/**
* @return an exclusive-end iterator over all Unicode scalar values.
* @draft ICU 78
*/
auto end() const { return prv::CodePointsIterator<CP32, true>(0x110000); }
};

/**
* Result of decoding a code unit sequence for one code point.
* Returned from non-validating Unicode string code point iterators.
Expand Down
Loading