unicode-org · markusicu · Jul 22, 2025 · Jul 18, 2025
diff --git a/icu4c/source/common/common.vcxproj.filters b/icu4c/source/common/common.vcxproj.filters
@@ -1270,6 +1270,9 @@
     <CustomBuild Include="unicode\utfiterator.h">
       <Filter>strings</Filter>
     </CustomBuild>
+    <CustomBuild Include="unicode\utfstring.h">
+      <Filter>strings</Filter>
+    </CustomBuild>
     <CustomBuild Include="unicode\bytestrie.h">
       <Filter>collections</Filter>
     </CustomBuild>

diff --git a/icu4c/source/common/unicode/bytestream.h b/icu4c/source/common/unicode/bytestream.h
@@ -41,6 +41,8 @@
 
 #if U_SHOW_CPLUSPLUS_API
 
+#include <type_traits>
+
 #include "unicode/uobject.h"
 #include "unicode/std_string.h"
 
@@ -260,11 +262,12 @@ class U_COMMON_API CheckedArrayByteSink : public ByteSink {
 
 /** 
  * Implementation of ByteSink that writes to a "string".
- * The StringClass is usually instantiated with a std::string.
+ * The StringClass is usually instantiated with a std::string or a std::u8string.
  * @stable ICU 4.2
  */
 template<typename StringClass>
 class StringByteSink : public ByteSink {
+  using Unit = typename StringClass::value_type;
  public:
   /**
    * Constructs a ByteSink that will append bytes to the dest string.
@@ -291,7 +294,13 @@ class StringByteSink : public ByteSink {
    * @param n the number of bytes; must be non-negative
    * @stable ICU 4.2
    */
-  virtual void Append(const char* data, int32_t n) override { dest_->append(data, n); }
+  virtual void Append(const char* data, int32_t n) override {
+    if constexpr (std::is_same_v<Unit, char>) {  // char-based string: append the bytes as they are
+      dest_->append(data, n);
+    } else {  // other unit type: view the same bytes as Unit; n is then a count of Units
+      dest_->append(reinterpret_cast<const Unit*>(data), n);  // NOTE(review): only safe if sizeof(Unit)==1 -- consider a static_assert
+    }
+  }
  private:
   StringClass* dest_;
 

diff --git a/icu4c/source/common/unicode/uchar.h b/icu4c/source/common/unicode/uchar.h
@@ -2918,7 +2918,7 @@ u_stringHasBinaryProperty(const UChar *s, int32_t length, UProperty which);
  * @return the property as a set
  * @see UProperty
  * @see u_hasBinaryProperty
- * @see Unicode::fromUSet
+ * @see UnicodeSet::fromUSet
  * @stable ICU 63
  */
 U_CAPI const USet * U_EXPORT2

diff --git a/icu4c/source/common/unicode/unistr.h b/icu4c/source/common/unicode/unistr.h
@@ -215,6 +215,10 @@ class UnicodeStringAppendable;  // unicode/appendable.h
  *
  * The UnicodeString equivalent of std::string’s clear() is remove().
  *
+ * Starting with ICU 78, a UnicodeString is a C++ "range" of char16_t code units.
+ * utfStringCodePoints() and unsafeUTFStringCodePoints() can be used to iterate over
+ * the code points.
+ *
  * A UnicodeString may "alias" an external array of characters
  * (that is, point to it, rather than own the array)
  * whose lifetime must then at least match the lifetime of the aliasing object.
@@ -289,12 +293,17 @@ class UnicodeStringAppendable;  // unicode/appendable.h
  * [User Guide Strings chapter](https://unicode-org.github.io/icu/userguide/strings#maximizing-performance-with-the-unicodestring-storage-model).
  *
  * @see utf.h
+ * @see utfiterator.h
+ * @see utfStringCodePoints
+ * @see unsafeUTFStringCodePoints
  * @see CharacterIterator
  * @stable ICU 2.0
  */
 class U_COMMON_API UnicodeString : public Replaceable
 {
 public:
+  /** C++ boilerplate: the code unit type of this string. @internal */
+  using value_type = char16_t;
 
   /**
    * Constant to be used in the UnicodeString(char *, int32_t, EInvariant) constructor
@@ -1767,7 +1776,8 @@ class U_COMMON_API UnicodeString : public Replaceable
    * Unpaired surrogates are replaced with U+FFFD.
    * Calls toUTF8().
    *
-   * @param result A standard string (or a compatible object)
+   * @tparam StringClass A std::string or a std::u8string (or a compatible type)
+   * @param result A std::string or a std::u8string (or a compatible object)
    *        to which the UTF-8 version of the string is appended.
    * @return The string object.
    * @stable ICU 4.2
@@ -1780,6 +1790,27 @@ class U_COMMON_API UnicodeString : public Replaceable
     return result;
   }
 
+#ifndef U_HIDE_DRAFT_API
+  /**
+   * Convert the UnicodeString to a UTF-8 string.
+   * Unpaired surrogates are replaced with U+FFFD.
+   * Calls toUTF8() with a StringByteSink that appends to a new, default-constructed StringClass.
+   *
+   * @tparam StringClass A std::string or a std::u8string (or a compatible type); not deducible, so it must be specified explicitly
+   * @return A std::string or a std::u8string (or a compatible object)
+   *        with the UTF-8 version of the string.
+   * @draft ICU 78
+   * @see toUTF8
+   */
+  template<typename StringClass>
+  StringClass toUTF8String() const {
+    StringClass result;
+    StringByteSink<StringClass> sbs(&result, length());  // NOTE(review): length() is presumably the sink's initial capacity hint -- confirm against the StringByteSink constructor
+    toUTF8(sbs);
+    return result;
+  }
+#endif  // U_HIDE_DRAFT_API
+
   /**
    * Convert the UnicodeString to UTF-32.
    * Unpaired surrogates are replaced with U+FFFD.
@@ -1892,6 +1923,33 @@ class U_COMMON_API UnicodeString : public Replaceable
    */
   inline UBool isBogus() const;
 
+#ifndef U_HIDE_DRAFT_API
+  /**
+   * @return an iterator to the first code unit in this string.
+   *     Same type as the iterator of std::u16string_view: a pointer or a contiguous-iterator object.
+   * @draft ICU 78
+   */
+  auto begin() const { return std::u16string_view(*this).begin(); }
+  /**
+   * @return an iterator to just past the last code unit in this string.
+   *     Same type as the iterator of std::u16string_view: a pointer or a contiguous-iterator object.
+   * @draft ICU 78
+   */
+  auto end() const { return std::u16string_view(*this).end(); }
+  /**
+   * @return a reverse iterator to the last code unit in this string.
+   *     The iterator is a std::reverse_iterator adaptor (as returned by std::u16string_view::rbegin()), not a pointer.
+   * @draft ICU 78
+   */
+  auto rbegin() const { return std::u16string_view(*this).rbegin(); }
+  /**
+   * @return a reverse iterator to just before the first code unit in this string.
+   *     The iterator is a std::reverse_iterator adaptor (as returned by std::u16string_view::rend()), not a pointer.
+   * @draft ICU 78
+   */
+  auto rend() const { return std::u16string_view(*this).rend(); }
+#endif  // U_HIDE_DRAFT_API
+
   //========================================
   // Write operations
   //========================================
@@ -2318,6 +2376,16 @@ class U_COMMON_API UnicodeString : public Replaceable
    */
   UnicodeString& append(UChar32 srcChar);
 
+#ifndef U_HIDE_DRAFT_API
+  /**
+   * Appends the code unit `c` to the end of this UnicodeString object.
+   * Same as append(c) except does not return *this; uses the standard-container name push_back.
+   *
+   * @param c the code unit to append
+   * @draft ICU 78
+   */
+  inline void push_back(char16_t c) { append(c); }
+#endif  // U_HIDE_DRAFT_API
 
   /* Insert operations */
 

diff --git a/icu4c/source/common/unicode/utf.h b/icu4c/source/common/unicode/utf.h
@@ -121,8 +121,39 @@
 
 /* single-code point definitions -------------------------------------------- */
 
+#ifndef U_HIDE_DRAFT_API
+
+/**
+ * Is c a Unicode code point U+0000..U+10FFFF?
+ * https://www.unicode.org/glossary/#code_point
+ *
+ * @param c 32-bit code point; it is cast to uint32_t, so negative values compare as large values
+ * @return true if c is in the range 0..0x10ffff, otherwise false
+ * @draft ICU 78
+ * @see AllCodePoints
+ * @see U_IS_SCALAR_VALUE
+ */
+#define U_IS_CODE_POINT(c) ((uint32_t)(c)<=0x10ffff)
+
+/**
+ * Is c a Unicode scalar value, that is, a non-surrogate code point?
+ * Only scalar values can be represented in well-formed UTF-8/16/32.
+ * https://www.unicode.org/glossary/#unicode_scalar_value
+ *
+ * @param c 32-bit code point; might be evaluated multiple times
+ * @return true if c is in 0..0xd7ff or in 0xe000..0x10ffff, otherwise false
+ * @draft ICU 78
+ * @see AllScalarValues
+ * @see U_IS_CODE_POINT
+ */
+#define U_IS_SCALAR_VALUE(c) ((uint32_t)(c)<0xd800 || (0xe000<=(c) && (c)<=0x10ffff))
+
+#endif  // U_HIDE_DRAFT_API
+
 /**
  * Is this code point a Unicode noncharacter?
+ * https://www.unicode.org/glossary/#noncharacter
+ *
  * @param c 32-bit code point
  * @return true or false
  * @stable ICU 2.4
@@ -150,7 +181,7 @@
  */
 #define U_IS_UNICODE_CHAR(c) \
     ((uint32_t)(c)<0xd800 || \
-        (0xdfff<(c) && (c)<=0x10ffff && !U_IS_UNICODE_NONCHAR(c)))
+        (0xe000<=(c) && (c)<=0x10ffff && !U_IS_UNICODE_NONCHAR(c)))
 
 /**
  * Is this code point a BMP code point (U+0000..U+ffff)?

diff --git a/icu4c/source/common/unicode/utf8.h b/icu4c/source/common/unicode/utf8.h
@@ -170,7 +170,7 @@ utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i);
  * @return true or false
  * @stable ICU 2.4
  */
-#define U8_IS_SINGLE(c) (((c)&0x80)==0)
+#define U8_IS_SINGLE(c) ((int8_t)(c)>=0)
 
 /**
  * Is this code unit (byte) a UTF-8 lead byte? (0xC2..0xF4)
@@ -214,6 +214,32 @@ utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i);
  */
 #define U8_MAX_LENGTH 4
 
+#ifndef U_HIDE_DRAFT_API
+
+/**
+ * Returns the length of a well-formed UTF-8 byte sequence according to its lead byte.
+ * Returns 1 for 0..0xc1 as well as for 0xf5..0xff (bytes outside the lead byte range 0xc2..0xf4).
+ * leadByte might be evaluated multiple times.
+ *
+ * @param leadByte The first byte of a UTF-8 sequence. Must be 0..0xff.
+ * @return 1..4: one more than U8_COUNT_TRAIL_BYTES(leadByte)
+ * @draft ICU 78
+ */
+#define U8_LENGTH_FROM_LEAD_BYTE(leadByte) (U8_COUNT_TRAIL_BYTES(leadByte) + 1)
+
+/**
+ * Returns the length of a well-formed UTF-8 byte sequence according to its lead byte.
+ * Returns 1 for 0..0xc1. Undefined for 0xf5..0xff, so the caller must ensure leadByte<=0xf4.
+ * leadByte might be evaluated multiple times.
+ *
+ * @param leadByte The first byte of a UTF-8 sequence. Must be 0..0xff.
+ * @return 1..4: one more than U8_COUNT_TRAIL_BYTES_UNSAFE(leadByte)
+ * @draft ICU 78
+ */
+#define U8_LENGTH_FROM_LEAD_BYTE_UNSAFE(leadByte) (U8_COUNT_TRAIL_BYTES_UNSAFE(leadByte) + 1)
+
+#endif  // U_HIDE_DRAFT_API
+
 /**
  * Get a code point from a string at a random-access offset,
  * without changing the offset.

diff --git a/icu4c/source/common/unicode/utfiterator.h b/icu4c/source/common/unicode/utfiterator.h
@@ -232,8 +232,112 @@ template<typename Range>
 constexpr bool range = range_type<Range>::value;
 
 #endif
+
+/** Iterator that steps through consecutive code point values; if skipSurrogates, ++ goes from 0xd7ff to 0xe000. @internal */
+template<typename CP32, bool skipSurrogates>
+class CodePointsIterator {
+    static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
+public:
+    /** C++ iterator boilerplate: the code point type. @internal */
+    using value_type = CP32;
+    /** C++ iterator boilerplate: operator*() returns the code point by value. @internal */
+    using reference = value_type;
+    /** C++ iterator boilerplate @internal */
+    using pointer = CP32 *;
+    /** C++ iterator boilerplate @internal */
+    using difference_type = int32_t;
+    /** C++ iterator boilerplate @internal */
+    using iterator_category = std::forward_iterator_tag;
+
+    /** Creates an iterator positioned at the code point c. @internal */
+    inline CodePointsIterator(CP32 c) : c_(c) {}
+    /** Two iterators are equal if they are positioned at the same code point. @internal */
+    inline bool operator==(const CodePointsIterator &other) const { return c_ == other.c_; }
+    /** Negation of operator==. @internal */
+    inline bool operator!=(const CodePointsIterator &other) const { return !operator==(other); }
+    /** Returns the current code point. @internal */
+    inline CP32 operator*() const { return c_; }
+    /** Advances to the next code point; if skipSurrogates, steps over 0xd800..0xdfff. @internal */
+    inline CodePointsIterator &operator++() {  // pre-increment
+        ++c_;
+        if (skipSurrogates && c_ == 0xd800) {  // reached the first surrogate: continue after the last one
+            c_ = 0xe000;
+        }
+        return *this;
+    }
+    /** Advances like the pre-increment, but returns a copy made before the step. @internal */
+    inline CodePointsIterator operator++(int) {  // post-increment
+        CodePointsIterator result(*this);
+        ++(*this);
+        return result;
+    }
+
+private:
+    CP32 c_;  // the current code point
+};
+
 }  // namespace prv
 
+/**
+ * A C++ "range" over all Unicode code points U+0000..U+10FFFF.
+ * https://www.unicode.org/glossary/#code_point
+ *
+ * Intended for test and builder code.
+ *
+ * @tparam CP32 Code point type: UChar32 (=int32_t) or char32_t or uint32_t
+ * @draft ICU 78
+ * @see U_IS_CODE_POINT
+ */
+template<typename CP32>
+class AllCodePoints {
+    static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
+public:
+    /** Constructor. @draft ICU 78 */
+    AllCodePoints() {}
+    /**
+     * @return an iterator over all Unicode code points, starting at U+0000.
+     *     The iterator returns CP32 integers.
+     * @draft ICU 78
+     */
+    auto begin() const { return prv::CodePointsIterator<CP32, false>(0); }
+    /**
+     * @return an exclusive-end iterator over all Unicode code points, positioned at 0x110000 (one past U+10FFFF).
+     * @draft ICU 78
+     */
+    auto end() const { return prv::CodePointsIterator<CP32, false>(0x110000); }
+};
+
+/**
+ * A C++ "range" over all Unicode scalar values U+0000..U+D7FF & U+E000..U+10FFFF.
+ * That is, all code points except surrogates.
+ * Only scalar values can be represented in well-formed UTF-8/16/32.
+ * https://www.unicode.org/glossary/#unicode_scalar_value
+ *
+ * Intended for test and builder code.
+ *
+ * @tparam CP32 Code point type: UChar32 (=int32_t) or char32_t or uint32_t
+ * @draft ICU 78
+ * @see U_IS_SCALAR_VALUE
+ */
+template<typename CP32>
+class AllScalarValues {
+    static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
+public:
+    /** Constructor. @draft ICU 78 */
+    AllScalarValues() {}
+    /**
+     * @return an iterator over all Unicode scalar values, starting at U+0000 and skipping the surrogates.
+     *     The iterator returns CP32 integers.
+     * @draft ICU 78
+     */
+    auto begin() const { return prv::CodePointsIterator<CP32, true>(0); }
+    /**
+     * @return an exclusive-end iterator over all Unicode scalar values, positioned at 0x110000 (one past U+10FFFF).
+     * @draft ICU 78
+     */
+    auto end() const { return prv::CodePointsIterator<CP32, true>(0x110000); }
+};
+
 /**
  * Result of decoding a code unit sequence for one code point.
  * Returned from non-validating Unicode string code point iterators.