Skip to content

Commit 11a337f

Browse files
jagerman authored and wjakob committed
Unicode fixes and docs (#624)
* Propagate unicode conversion failure

  If returning a std::string with invalid UTF-8 data, we currently fail with an uninformative TypeError instead of propagating the UnicodeDecodeError that Python sets on failure.

* Add support for u16/u32strings and literals

  This adds support for char{16,32}_t character literals and the associated std::u{16,32}string types. It also folds the character/string conversion into a single type_caster template, since the type casters for string and wstring were mostly the same anyway.

* Added too-long and too-big character conversion errors

  With this commit, when casting to a single character, as opposed to a C-style string, we make sure the input wasn't a multi-character string or a single character with a code point too large for the character type.

  This also changes the character cast op to CharT instead of CharT& (partly because we need to be able to return a temporary decoded char value, but also because there's little gained by bothering with an lvalue return here).

  Finally it changes the char caster to 'has-a-string-caster' instead of 'is-a-string-caster' because, with the cast_op change above, there's nothing at all gained from inheritance. This also lets us move the `success` flag from the string caster (which was only there for the char caster) into the char caster itself. (I also renamed it to 'none' and inverted its value to better reflect its purpose.) The None -> nullptr loading also now takes place only under a `convert = true` load pass. Although it's unlikely that a function taking a char also has overloads that can take a None, it seems marginally more correct to treat it as a conversion.

  This commit also simplifies the assumptions about character sizes, with static_asserts to back them up.
1 parent ada763b commit 11a337f

File tree

5 files changed

+263
-87
lines changed

5 files changed

+263
-87
lines changed

docs/advanced/cast/overview.rst

+12
Original file line number | Diff line number | Diff line change
@@ -94,14 +94,26 @@ as arguments and return values, refer to the section on binding :ref:`classes`.
9494
+------------------------------------+---------------------------+-------------------------------+
9595
| ``char`` | Character literal | :file:`pybind11/pybind11.h` |
9696
+------------------------------------+---------------------------+-------------------------------+
97+
| ``char16_t`` | UTF-16 character literal | :file:`pybind11/pybind11.h` |
98+
+------------------------------------+---------------------------+-------------------------------+
99+
| ``char32_t`` | UTF-32 character literal | :file:`pybind11/pybind11.h` |
100+
+------------------------------------+---------------------------+-------------------------------+
97101
| ``wchar_t`` | Wide character literal | :file:`pybind11/pybind11.h` |
98102
+------------------------------------+---------------------------+-------------------------------+
99103
| ``const char *`` | UTF-8 string literal | :file:`pybind11/pybind11.h` |
100104
+------------------------------------+---------------------------+-------------------------------+
105+
| ``const char16_t *`` | UTF-16 string literal | :file:`pybind11/pybind11.h` |
106+
+------------------------------------+---------------------------+-------------------------------+
107+
| ``const char32_t *`` | UTF-32 string literal | :file:`pybind11/pybind11.h` |
108+
+------------------------------------+---------------------------+-------------------------------+
101109
| ``const wchar_t *`` | Wide string literal | :file:`pybind11/pybind11.h` |
102110
+------------------------------------+---------------------------+-------------------------------+
103111
| ``std::string`` | STL dynamic UTF-8 string | :file:`pybind11/pybind11.h` |
104112
+------------------------------------+---------------------------+-------------------------------+
113+
| ``std::u16string`` | STL dynamic UTF-16 string | :file:`pybind11/pybind11.h` |
114+
+------------------------------------+---------------------------+-------------------------------+
115+
| ``std::u32string`` | STL dynamic UTF-32 string | :file:`pybind11/pybind11.h` |
116+
+------------------------------------+---------------------------+-------------------------------+
105117
| ``std::wstring`` | STL dynamic wide string | :file:`pybind11/pybind11.h` |
106118
+------------------------------------+---------------------------+-------------------------------+
107119
| ``std::pair<T1, T2>`` | Pair of two custom types | :file:`pybind11/pybind11.h` |

include/pybind11/cast.h

+116-87
Original file line number | Diff line number | Diff line change
@@ -471,8 +471,15 @@ template <typename type> class type_caster<std::reference_wrapper<type>> : publi
471471
template <typename _T> using cast_op_type = pybind11::detail::cast_op_type<_T>
472472

473473

474+
template <typename CharT> using is_std_char_type = any_of<
475+
std::is_same<CharT, char>, /* std::string */
476+
std::is_same<CharT, char16_t>, /* std::u16string */
477+
std::is_same<CharT, char32_t>, /* std::u32string */
478+
std::is_same<CharT, wchar_t> /* std::wstring */
479+
>;
480+
474481
template <typename T>
475-
struct type_caster<T, enable_if_t<std::is_arithmetic<T>::value>> {
482+
struct type_caster<T, enable_if_t<std::is_arithmetic<T>::value && !is_std_char_type<T>::value>> {
476483
using _py_type_0 = conditional_t<sizeof(T) <= sizeof(long), long, long long>;
477484
using _py_type_1 = conditional_t<std::is_signed<T>::value, _py_type_0, typename std::make_unsigned<_py_type_0>::type>;
478485
using py_type = conditional_t<std::is_floating_point<T>::value, double, _py_type_1>;
@@ -617,122 +624,144 @@ template <> class type_caster<bool> {
617624
PYBIND11_TYPE_CASTER(bool, _("bool"));
618625
};
619626

620-
template <> class type_caster<std::string> {
621-
public:
622-
bool load(handle src, bool) {
623-
object temp;
624-
handle load_src = src;
625-
if (!src) {
626-
return false;
627-
} else if (PyUnicode_Check(load_src.ptr())) {
628-
temp = reinterpret_steal<object>(PyUnicode_AsUTF8String(load_src.ptr()));
629-
if (!temp) { PyErr_Clear(); return false; } // UnicodeEncodeError
630-
load_src = temp;
631-
}
632-
char *buffer;
633-
ssize_t length;
634-
int err = PYBIND11_BYTES_AS_STRING_AND_SIZE(load_src.ptr(), &buffer, &length);
635-
if (err == -1) { PyErr_Clear(); return false; } // TypeError
636-
value = std::string(buffer, (size_t) length);
637-
success = true;
638-
return true;
639-
}
640-
641-
static handle cast(const std::string &src, return_value_policy /* policy */, handle /* parent */) {
642-
return PyUnicode_FromStringAndSize(src.c_str(), (ssize_t) src.length());
643-
}
644-
645-
PYBIND11_TYPE_CASTER(std::string, _(PYBIND11_STRING_NAME));
646-
protected:
647-
bool success = false;
648-
};
627+
// Helper class for UTF-{8,16,32} C++ stl strings:
628+
template <typename CharT, class Traits, class Allocator>
629+
struct type_caster<std::basic_string<CharT, Traits, Allocator>, enable_if_t<is_std_char_type<CharT>::value>> {
630+
// Simplify life by being able to assume standard char sizes (the standard only guarantees
631+
// minimums), but Python requires exact sizes
632+
static_assert(!std::is_same<CharT, char>::value || sizeof(CharT) == 1, "Unsupported char size != 1");
633+
static_assert(!std::is_same<CharT, char16_t>::value || sizeof(CharT) == 2, "Unsupported char16_t size != 2");
634+
static_assert(!std::is_same<CharT, char32_t>::value || sizeof(CharT) == 4, "Unsupported char32_t size != 4");
635+
// wchar_t can be either 16 bits (Windows) or 32 (everywhere else)
636+
static_assert(!std::is_same<CharT, wchar_t>::value || sizeof(CharT) == 2 || sizeof(CharT) == 4,
637+
"Unsupported wchar_t size != 2/4");
638+
static constexpr size_t UTF_N = 8 * sizeof(CharT);
639+
static constexpr const char *encoding = UTF_N == 8 ? "utf8" : UTF_N == 16 ? "utf16" : "utf32";
640+
641+
using StringType = std::basic_string<CharT, Traits, Allocator>;
649642

650-
template <> class type_caster<std::wstring> {
651-
public:
652643
bool load(handle src, bool) {
644+
#if PY_VERSION_MAJOR < 3
653645
object temp;
646+
#endif
654647
handle load_src = src;
655648
if (!src) {
656649
return false;
657650
} else if (!PyUnicode_Check(load_src.ptr())) {
651+
#if PY_VERSION_MAJOR >= 3
652+
return false;
653+
// The below is a guaranteed failure in Python 3 when PyUnicode_Check returns false
654+
#else
658655
temp = reinterpret_steal<object>(PyUnicode_FromObject(load_src.ptr()));
659656
if (!temp) { PyErr_Clear(); return false; }
660657
load_src = temp;
661-
}
662-
wchar_t *buffer = nullptr;
663-
ssize_t length = -1;
664-
#if PY_MAJOR_VERSION >= 3
665-
buffer = PyUnicode_AsWideCharString(load_src.ptr(), &length);
666-
#else
667-
temp = reinterpret_steal<object>(PyUnicode_AsEncodedString(
668-
load_src.ptr(), sizeof(wchar_t) == sizeof(short)
669-
? "utf16" : "utf32", nullptr));
670-
671-
if (temp) {
672-
int err = PYBIND11_BYTES_AS_STRING_AND_SIZE(temp.ptr(), (char **) &buffer, &length);
673-
if (err == -1) { buffer = nullptr; } // TypeError
674-
length = length / (ssize_t) sizeof(wchar_t) - 1; ++buffer; // Skip BOM
675-
}
676658
#endif
677-
if (!buffer) { PyErr_Clear(); return false; }
678-
value = std::wstring(buffer, (size_t) length);
679-
success = true;
659+
}
660+
661+
object utfNbytes = reinterpret_steal<object>(PyUnicode_AsEncodedString(
662+
load_src.ptr(), encoding, nullptr));
663+
if (!utfNbytes) { PyErr_Clear(); return false; }
664+
665+
const CharT *buffer = reinterpret_cast<const CharT *>(PYBIND11_BYTES_AS_STRING(utfNbytes.ptr()));
666+
size_t length = (size_t) PYBIND11_BYTES_SIZE(utfNbytes.ptr()) / sizeof(CharT);
667+
if (UTF_N > 8) { buffer++; length--; } // Skip BOM for UTF-16/32
668+
value = StringType(buffer, length);
680669
return true;
681670
}
682671

683-
static handle cast(const std::wstring &src, return_value_policy /* policy */, handle /* parent */) {
684-
return PyUnicode_FromWideChar(src.c_str(), (ssize_t) src.length());
672+
static handle cast(const StringType &src, return_value_policy /* policy */, handle /* parent */) {
673+
const char *buffer = reinterpret_cast<const char *>(src.c_str());
674+
ssize_t nbytes = ssize_t(src.size() * sizeof(CharT));
675+
handle s = PyUnicode_Decode(buffer, nbytes, encoding, nullptr);
676+
if (!s) throw error_already_set();
677+
return s;
685678
}
686679

687-
PYBIND11_TYPE_CASTER(std::wstring, _(PYBIND11_STRING_NAME));
688-
protected:
689-
bool success = false;
680+
PYBIND11_TYPE_CASTER(StringType, _(PYBIND11_STRING_NAME));
690681
};
691682

692-
template <> class type_caster<char> : public type_caster<std::string> {
683+
// Type caster for C-style strings. We basically use a std::string type caster, but also add the
684+
// ability to use None as a nullptr char* (which the string caster doesn't allow).
685+
template <typename CharT> struct type_caster<CharT, enable_if_t<is_std_char_type<CharT>::value>> {
686+
using StringType = std::basic_string<CharT>;
687+
using StringCaster = type_caster<StringType>;
688+
StringCaster str_caster;
689+
bool none = false;
693690
public:
694691
bool load(handle src, bool convert) {
695-
if (src.is_none()) return true;
696-
return type_caster<std::string>::load(src, convert);
697-
}
698-
699-
static handle cast(const char *src, return_value_policy /* policy */, handle /* parent */) {
700-
if (src == nullptr) return none().inc_ref();
701-
return PyUnicode_FromString(src);
692+
if (!src) return false;
693+
if (src.is_none()) {
694+
// Defer accepting None to other overloads (if we aren't in convert mode):
695+
if (!convert) return false;
696+
none = true;
697+
return true;
698+
}
699+
return str_caster.load(src, convert);
702700
}
703701

704-
static handle cast(char src, return_value_policy /* policy */, handle /* parent */) {
705-
char str[2] = { src, '\0' };
706-
return PyUnicode_DecodeLatin1(str, 1, nullptr);
702+
static handle cast(const CharT *src, return_value_policy policy, handle parent) {
703+
if (src == nullptr) return pybind11::none().inc_ref();
704+
return StringCaster::cast(StringType(src), policy, parent);
707705
}
708706

709-
operator char*() { return success ? const_cast<char *>(value.c_str()) : nullptr; }
710-
operator char&() { return value[0]; }
711-
712-
static PYBIND11_DESCR name() { return type_descr(_(PYBIND11_STRING_NAME)); }
713-
};
707+
static handle cast(CharT src, return_value_policy policy, handle parent) {
708+
if (std::is_same<char, CharT>::value) {
709+
handle s = PyUnicode_DecodeLatin1((const char *) &src, 1, nullptr);
710+
if (!s) throw error_already_set();
711+
return s;
712+
}
713+
return StringCaster::cast(StringType(1, src), policy, parent);
714+
}
715+
716+
operator CharT*() { return none ? nullptr : const_cast<CharT *>(static_cast<StringType &>(str_caster).c_str()); }
717+
operator CharT() {
718+
if (none)
719+
throw value_error("Cannot convert None to a character");
720+
721+
auto &value = static_cast<StringType &>(str_caster);
722+
size_t str_len = value.size();
723+
if (str_len == 0)
724+
throw value_error("Cannot convert empty string to a character");
725+
726+
// If we're in UTF-8 mode, we have two possible failures: one for a unicode character that
727+
// is too high, and one for multiple unicode characters (caught later), so we need to figure
728+
// out how long the first encoded character is in bytes to distinguish between these two
729+
// errors. We also want to allow unicode characters U+0080 through U+00FF, as those
730+
// can fit into a single char value.
731+
if (StringCaster::UTF_N == 8 && str_len > 1 && str_len <= 4) {
732+
unsigned char v0 = static_cast<unsigned char>(value[0]);
733+
size_t char0_bytes = !(v0 & 0x80) ? 1 : // low bits only: 0-127
734+
(v0 & 0xE0) == 0xC0 ? 2 : // 0b110xxxxx - start of 2-byte sequence
735+
(v0 & 0xF0) == 0xE0 ? 3 : // 0b1110xxxx - start of 3-byte sequence
736+
4; // 0b11110xxx - start of 4-byte sequence
737+
738+
if (char0_bytes == str_len) {
739+
// If we have a 128-255 value, we can decode it into a single char:
740+
if (char0_bytes == 2 && (v0 & 0xFC) == 0xC0) { // 0x110000xx 0x10xxxxxx
741+
return static_cast<CharT>(((v0 & 3) << 6) + (static_cast<unsigned char>(value[1]) & 0x3F));
742+
}
743+
// Otherwise we have a single character, but it's > U+00FF
744+
throw value_error("Character code point not in range(0x100)");
745+
}
746+
}
714747

715-
template <> class type_caster<wchar_t> : public type_caster<std::wstring> {
716-
public:
717-
bool load(handle src, bool convert) {
718-
if (src.is_none()) return true;
719-
return type_caster<std::wstring>::load(src, convert);
720-
}
748+
// UTF-16 is much easier: we can only have a surrogate pair for values above U+FFFF, thus a
749+
// surrogate pair with total length 2 instantly indicates a range error (but not a "your
750+
// string was too long" error).
751+
else if (StringCaster::UTF_N == 16 && str_len == 2) {
752+
char16_t v0 = static_cast<char16_t>(value[0]);
753+
if (v0 >= 0xD800 && v0 < 0xE000)
754+
throw value_error("Character code point not in range(0x10000)");
755+
}
721756

722-
static handle cast(const wchar_t *src, return_value_policy /* policy */, handle /* parent */) {
723-
if (src == nullptr) return none().inc_ref();
724-
return PyUnicode_FromWideChar(src, (ssize_t) wcslen(src));
725-
}
757+
if (str_len != 1)
758+
throw value_error("Expected a character, but multi-character string found");
726759

727-
static handle cast(wchar_t src, return_value_policy /* policy */, handle /* parent */) {
728-
wchar_t wstr[2] = { src, L'\0' };
729-
return PyUnicode_FromWideChar(wstr, 1);
760+
return value[0];
730761
}
731762

732-
operator wchar_t*() { return success ? const_cast<wchar_t *>(value.c_str()) : nullptr; }
733-
operator wchar_t&() { return value[0]; }
734-
735763
static PYBIND11_DESCR name() { return type_descr(_(PYBIND11_STRING_NAME)); }
764+
template <typename _T> using cast_op_type = typename std::remove_reference<pybind11::detail::cast_op_type<_T>>::type;
736765
};
737766

738767
template <typename T1, typename T2> class type_caster<std::pair<T1, T2>> {

include/pybind11/common.h

+2
Original file line number | Diff line number | Diff line change
@@ -111,6 +111,7 @@
111111
#define PYBIND11_BYTES_FROM_STRING_AND_SIZE PyBytes_FromStringAndSize
112112
#define PYBIND11_BYTES_AS_STRING_AND_SIZE PyBytes_AsStringAndSize
113113
#define PYBIND11_BYTES_AS_STRING PyBytes_AsString
114+
#define PYBIND11_BYTES_SIZE PyBytes_Size
114115
#define PYBIND11_LONG_CHECK(o) PyLong_Check(o)
115116
#define PYBIND11_LONG_AS_LONGLONG(o) PyLong_AsLongLong(o)
116117
#define PYBIND11_LONG_AS_UNSIGNED_LONGLONG(o) PyLong_AsUnsignedLongLong(o)
@@ -129,6 +130,7 @@
129130
#define PYBIND11_BYTES_FROM_STRING_AND_SIZE PyString_FromStringAndSize
130131
#define PYBIND11_BYTES_AS_STRING_AND_SIZE PyString_AsStringAndSize
131132
#define PYBIND11_BYTES_AS_STRING PyString_AsString
133+
#define PYBIND11_BYTES_SIZE PyString_Size
132134
#define PYBIND11_LONG_CHECK(o) (PyInt_Check(o) || PyLong_Check(o))
133135
#define PYBIND11_LONG_AS_LONGLONG(o) (PyInt_Check(o) ? (long long) PyLong_AsLong(o) : PyLong_AsLongLong(o))
134136
#define PYBIND11_LONG_AS_UNSIGNED_LONGLONG(o) (PyInt_Check(o) ? (unsigned long long) PyLong_AsUnsignedLong(o) : PyLong_AsUnsignedLongLong(o))

tests/test_python_types.cpp

+42
Original file line number | Diff line number | Diff line change
@@ -17,6 +17,11 @@
1717
# include <fcntl.h>
1818
#endif
1919

20+
#if defined(_MSC_VER)
21+
# pragma warning(push)
22+
# pragma warning(disable: 4127) // warning C4127: Conditional expression is constant
23+
#endif
24+
2025
class ExamplePythonTypes {
2126
public:
2227
static ExamplePythonTypes *new_instance() {
@@ -426,4 +431,41 @@ test_initializer python_types([](py::module &m) {
426431
"l"_a=l
427432
);
428433
});
434+
435+
// Some test characters in utf16 and utf32 encodings. The last one (the 𝐀) contains a null byte
436+
char32_t a32 = 0x61 /*a*/, z32 = 0x7a /*z*/, ib32 = 0x203d /**/, cake32 = 0x1f382 /*🎂*/, mathbfA32 = 0x1d400 /*𝐀*/;
437+
char16_t b16 = 0x62 /*b*/, z16 = 0x7a, ib16 = 0x203d, cake16_1 = 0xd83c, cake16_2 = 0xdf82, mathbfA16_1 = 0xd835, mathbfA16_2 = 0xdc00;
438+
std::wstring wstr;
439+
wstr.push_back(0x61); // a
440+
wstr.push_back(0x2e18); //
441+
if (sizeof(wchar_t) == 2) { wstr.push_back(mathbfA16_1); wstr.push_back(mathbfA16_2); } // 𝐀, utf16
442+
else { wstr.push_back((wchar_t) mathbfA32); } // 𝐀, utf32
443+
wstr.push_back(0x7a); // z
444+
445+
m.def("good_utf8_string", []() { return std::string(u8"Say utf8\u203d \U0001f382 \U0001d400"); }); // Say utf8‽ 🎂 𝐀
446+
m.def("good_utf16_string", [=]() { return std::u16string({ b16, ib16, cake16_1, cake16_2, mathbfA16_1, mathbfA16_2, z16 }); }); // b‽🎂𝐀z
447+
m.def("good_utf32_string", [=]() { return std::u32string({ a32, mathbfA32, cake32, ib32, z32 }); }); // a𝐀🎂‽z
448+
m.def("good_wchar_string", [=]() { return wstr; }); // a‽𝐀z
449+
m.def("bad_utf8_string", []() { return std::string("abc\xd0" "def"); });
450+
m.def("bad_utf16_string", [=]() { return std::u16string({ b16, char16_t(0xd800), z16 }); });
451+
// Under Python 2.7, invalid unicode UTF-32 characters don't appear to trigger UnicodeDecodeError
452+
if (PY_MAJOR_VERSION >= 3)
453+
m.def("bad_utf32_string", [=]() { return std::u32string({ a32, char32_t(0xd800), z32 }); });
454+
if (PY_MAJOR_VERSION >= 3 || sizeof(wchar_t) == 2)
455+
m.def("bad_wchar_string", [=]() { return std::wstring({ wchar_t(0x61), wchar_t(0xd800) }); });
456+
m.def("u8_Z", []() -> char { return 'Z'; });
457+
m.def("u8_eacute", []() -> char { return '\xe9'; });
458+
m.def("u16_ibang", [=]() -> char16_t { return ib16; });
459+
m.def("u32_mathbfA", [=]() -> char32_t { return mathbfA32; });
460+
m.def("wchar_heart", []() -> wchar_t { return 0x2665; });
461+
462+
m.attr("wchar_size") = py::cast(sizeof(wchar_t));
463+
m.def("ord_char", [](char c) -> int { return static_cast<unsigned char>(c); });
464+
m.def("ord_char16", [](char16_t c) -> uint16_t { return c; });
465+
m.def("ord_char32", [](char32_t c) -> uint32_t { return c; });
466+
m.def("ord_wchar", [](wchar_t c) -> int { return c; });
429467
});
468+
469+
#if defined(_MSC_VER)
470+
# pragma warning(pop)
471+
#endif

0 commit comments

Comments
 (0)