Skip to content

Commit a0a98d4

Browse files
committed
Split up single char and c-style string conversion
This moves the `char *` caster code into a new `type_caster<char*>` and repurposes the `type_caster<char>` for doing only single-character casting (plus the same for the other standard char types). For common types, treating `T` and `T*` the same makes sense, but `char` and `char*` really don't work being treated the same at all: a `char*` is fundamentally different in use compared to a char. This new caster is done by adding a conditional to `make_caster` that specifically adds the pointer when casting a `char *` (or similar) instead of throwing away the pointer (that is, it strips away everything down to the intrinsic_t, but then re-adds the pointer for these C-style string types). In the end, this leaves us now with three sets of related casters instead of two: - `std::string`/`std::wstring`/`std::uNNstring` - `char*`/`wchar_t*`/`charNN_t*` - `char`/`wchar_t`/`charNN_t` where the second ones are just wrappers around the first one, but with extra support for nullptr <-> None conversion. The third one is all new: it handles loading and casting single characters, with load failures for attempting to load a character too large for the associated type, and works on unicode codepoint inputs, not individual code units.
1 parent abd213d commit a0a98d4

File tree

2 files changed

+106
-44
lines changed

2 files changed

+106
-44
lines changed

include/pybind11/cast.h

+104-44
Original file line numberDiff line numberDiff line change
@@ -438,8 +438,25 @@ template <typename type> class type_caster_base : public type_caster_generic {
438438
static Constructor make_move_constructor(...) { return nullptr; }
439439
};
440440

441+
template <typename CharT> using is_std_char_type = any_of<
442+
std::is_same<CharT, char>,
443+
std::is_same<CharT, char16_t>,
444+
std::is_same<CharT, char32_t>,
445+
std::is_same<CharT, wchar_t>
446+
>;
447+
448+
// Detects whether a type is a c-style string (char*, wchar_t*, char16_t*, char32_t*, or fixed size
449+
// array of any of those); if so, make_caster<> will keep the pointer to send it to the CharT*
450+
// type_caster.
451+
template <typename T> using is_c_str_caster = all_of<
452+
satisfies_any_of<typename std::remove_reference<T>::type, std::is_pointer, std::is_array>,
453+
is_std_char_type<intrinsic_t<T>>
454+
>;
455+
441456
template <typename type, typename SFINAE = void> class type_caster : public type_caster_base<type> { };
442-
template <typename type> using make_caster = type_caster<intrinsic_t<type>>;
457+
template <typename type> using make_caster = type_caster<
458+
conditional_t<is_c_str_caster<type>::value, typename std::add_pointer<intrinsic_t<type>>::type,
459+
intrinsic_t<type>>>;
443460

444461
// Shortcut for calling a caster's `cast_op_type` cast operator for casting a type_caster to a T
445462
template <typename T> typename make_caster<T>::template cast_op_type<T> cast_op(make_caster<T> &caster) {
@@ -471,13 +488,6 @@ template <typename type> class type_caster<std::reference_wrapper<type>> : publi
471488
template <typename _T> using cast_op_type = pybind11::detail::cast_op_type<_T>
472489

473490

474-
template <typename CharT> using is_std_char_type = any_of<
475-
std::is_same<CharT, char>, /* std::string */
476-
std::is_same<CharT, char16_t>, /* std::u16string */
477-
std::is_same<CharT, char32_t>, /* std::u32string */
478-
std::is_same<CharT, wchar_t> /* std::wstring */
479-
>;
480-
481491
template <typename T>
482492
struct type_caster<T, enable_if_t<std::is_arithmetic<T>::value && !is_std_char_type<T>::value>> {
483493
typedef typename std::conditional<sizeof(T) <= sizeof(long), long, long long>::type _py_type_0;
@@ -620,22 +630,21 @@ template <> class type_caster<bool> {
620630
PYBIND11_TYPE_CASTER(bool, _("bool"));
621631
};
622632

623-
// Helper class for UTF-{8,16,32} strings:
633+
// Type caster for UTF-{8,16,32}-encoded std::strings. This also provides most of the
634+
// functionality for a c-style char*/wchar_t*/etc. string.
624635
template <typename CharT, class Traits, class Allocator>
625636
struct type_caster<std::basic_string<CharT, Traits, Allocator>, enable_if_t<is_std_char_type<CharT>::value>> {
626-
static constexpr unsigned int UTF_N =
627-
std::is_same<CharT, char>::value ? 8 :
628-
std::is_same<CharT, char16_t>::value ? 16 :
629-
std::is_same<CharT, char32_t>::value ? 32 :
630-
(sizeof(CharT) == 2 ? 16 : 32); /* std::wstring is UTF-16 on Windows, UTF-32 everywhere else */
631-
637+
// Simplify life by being able to assume standard char sizes (the standard only guarantees
638+
// minimums), but Python requires exact sizes
639+
static_assert(!std::is_same<CharT, char>::value || sizeof(CharT) == 1, "Unsupported char size != 1");
640+
static_assert(!std::is_same<CharT, char16_t>::value || sizeof(CharT) == 2, "Unsupported char16_t size != 2");
641+
static_assert(!std::is_same<CharT, char32_t>::value || sizeof(CharT) == 4, "Unsupported char32_t size != 4");
642+
// wchar_t can be either 16 bits (Windows) or 32 (everywhere else)
643+
static_assert(!std::is_same<CharT, wchar_t>::value || sizeof(CharT) == 2 || sizeof(CharT) == 4,
644+
"Unsupported wchar_t size != 2/4");
645+
static constexpr size_t UTF_N = 8 * sizeof(CharT);
632646
static constexpr const char *encoding = UTF_N == 8 ? "utf8" : UTF_N == 16 ? "utf16" : "utf32";
633647

634-
// C++ only requires char/char16_t/char32_t to be at least 8/16/32 bits, but Python's encoding
635-
// assumes exactly 1/2/4 bytes:
636-
static_assert(sizeof(CharT) == UTF_N / 8,
637-
"Internal error: string type_caster requires 1/2/4-sized character types");
638-
639648
using StringType = std::basic_string<CharT, Traits, Allocator>;
640649

641650
bool load(handle src, bool) {
@@ -644,9 +653,14 @@ struct type_caster<std::basic_string<CharT, Traits, Allocator>, enable_if_t<is_s
644653
if (!src) {
645654
return false;
646655
} else if (!PyUnicode_Check(load_src.ptr())) {
656+
#if PY_VERSION_MAJOR >= 3
657+
return false;
658+
// The below is a guaranteed failure in Python3 when PyUnicode_Check returns false
659+
#else
647660
temp = reinterpret_steal<object>(PyUnicode_FromObject(load_src.ptr()));
648661
if (!temp) { PyErr_Clear(); return false; }
649662
load_src = temp;
663+
#endif
650664
}
651665

652666
object utfNbytes = reinterpret_steal<object>(PyUnicode_AsEncodedString(
@@ -657,7 +671,6 @@ struct type_caster<std::basic_string<CharT, Traits, Allocator>, enable_if_t<is_s
657671
size_t length = (size_t) PYBIND11_BYTES_SIZE(utfNbytes.ptr()) / sizeof(CharT);
658672
if (UTF_N > 8) { buffer++; length--; } // Skip BOM for UTF-16/32
659673
value = StringType(buffer, length);
660-
success = true;
661674
return true;
662675
}
663676

@@ -670,40 +683,87 @@ struct type_caster<std::basic_string<CharT, Traits, Allocator>, enable_if_t<is_s
670683
}
671684

672685
PYBIND11_TYPE_CASTER(StringType, _(PYBIND11_STRING_NAME));
673-
protected:
674-
bool success = false;
675686
};
676687

677-
template <typename CharT> struct type_caster<CharT, enable_if_t<is_std_char_type<CharT>::value>>
678-
: type_caster<std::basic_string<CharT>> {
679-
using StringType = std::basic_string<CharT>;
680-
using StringCaster = type_caster<StringType>;
681-
using StringCaster::success;
682-
using StringCaster::value;
688+
// Type caster for c-style char* (and similar multibyte string). We use a std::string (or
689+
// std::{w,u16,u32}string caster) class, above, except for one thing: we convert between nullptr
690+
// and None (in essence, this is like a C++17 std::optional<std::string>).
691+
template <typename CharT> struct type_caster<CharT*, enable_if_t<is_std_char_type<CharT>::value>>
692+
: make_caster<std::basic_string<CharT>> {
693+
private:
694+
using Super = make_caster<std::basic_string<CharT>>;
695+
bool none = false;
683696
public:
684697
bool load(handle src, bool convert) {
685-
if (src.is_none()) return true;
686-
return StringCaster::load(src, convert);
698+
if (!src) return false;
699+
if (src.is_none()) {
700+
if (!convert) return false;
701+
none = true;
702+
return true;
703+
}
704+
return Super::load(src, convert);
687705
}
706+
// Converts a C-style string to a Python object; a null pointer is returned as None.
// The parameter is typed on CharT (not plain char) so that the wchar_t*/char16_t*/char32_t*
// instantiations of this caster accept their own pointer type. The string is built
// explicitly as std::basic_string<CharT> before being handed to the string caster.
static handle cast(const CharT *src, return_value_policy policy, handle parent) {
    if (src == nullptr) return pybind11::none().inc_ref();
    return Super::cast(std::basic_string<CharT>(src), policy, parent);
}
710+
operator CharT*() { return none ? nullptr : const_cast<CharT *>(Super::value.c_str()); }
688711

689-
static handle cast(const CharT *src, return_value_policy policy, handle parent) {
690-
if (src == nullptr) return none().inc_ref();
691-
return StringCaster::cast(StringType(src), policy, parent);
712+
template <typename> using cast_op_type = CharT *;
713+
};
714+
715+
// Single character caster (for standard character types)
716+
template <typename CharT> struct type_caster<CharT, enable_if_t<is_std_char_type<CharT>::value>> {
717+
private:
718+
static constexpr char32_t max = sizeof(CharT) == 1 ? 0xff : sizeof(CharT) == 2 ? 0xffff : 0x10ffff;
719+
720+
bool load_codepoint(char32_t codepoint) {
721+
if (codepoint > max)
722+
return false;
723+
value = std::is_signed<CharT>::value
724+
? static_cast<CharT>((typename std::make_unsigned<CharT>::type) codepoint)
725+
: (CharT) codepoint;
726+
return true;
692727
}
693728

694-
static handle cast(CharT src, return_value_policy policy, handle parent) {
695-
if (std::is_same<char, CharT>::value) {
696-
handle s = PyUnicode_DecodeLatin1((const char *) &src, 1, nullptr);
697-
if (!s) throw error_already_set();
698-
return s;
699-
}
700-
return StringCaster::cast(StringType(1, src), policy, parent);
729+
public:
730+
731+
#if PY_VERSION_HEX >= 0x03030000
732+
bool load(handle src, bool) {
733+
if (!src)
734+
return false;
735+
if (!PyUnicode_Check(src.ptr()))
736+
return false;
737+
int ready = PyUnicode_READY(src.ptr());
738+
if (ready == -1) { PyErr_Clear(); return false; }
739+
if (PyUnicode_GET_LENGTH(src.ptr()) != 1)
740+
return false;
741+
return load_codepoint(PyUnicode_READ_CHAR(src.ptr(), 0));
742+
}
743+
#else
744+
// Python before PEP 393 (implemented in Python 3.3) does not give any particularly nice way to
745+
// interact with unicode *characters* (like the above code), so we go through conversion to a
746+
// UTF-32 string from which we are guaranteed to be able to read codepoints (otherwise all we
747+
// can get out of Python is the number of code units, which is really not helpful in determining
748+
// whether we have a single character or not).
749+
bool load(handle src, bool convert) {
750+
make_caster<std::u32string> strcaster;
751+
if (!strcaster.load(src, convert))
752+
return false;
753+
auto &str = strcaster.operator std::u32string&();
754+
if (str.size() != 1)
755+
return false;
756+
return load_codepoint(str[0]);
701757
}
758+
#endif
702759

703-
operator CharT*() { return success ? (CharT *) value.c_str() : nullptr; }
704-
operator CharT&() { return value[0]; }
760+
static handle cast(CharT src, return_value_policy, handle) {
761+
int codepoint = int(std::is_signed<CharT>::value ? static_cast<typename std::make_unsigned<CharT>::type>(src) : src);
762+
return PyUnicode_FromFormat("%c", (int) codepoint);
763+
}
705764

706-
static PYBIND11_DESCR name() { return type_descr(_(PYBIND11_STRING_NAME)); }
765+
// chr()/unichr() isn't technically a type, but it should get the point across:
766+
PYBIND11_TYPE_CASTER(CharT, _(PYBIND11_CHR_NAME "(") + _<(max <= 0xffff)>(_("<=") + _<max>(), _("")) + _(")"));
707767
};
708768

709769
template <typename T1, typename T2> class type_caster<std::pair<T1, T2>> {

include/pybind11/common.h

+2
Original file line numberDiff line numberDiff line change
@@ -117,6 +117,7 @@
117117
#define PYBIND11_LONG_AS_UNSIGNED_LONGLONG(o) PyLong_AsUnsignedLongLong(o)
118118
#define PYBIND11_BYTES_NAME "bytes"
119119
#define PYBIND11_STRING_NAME "str"
120+
#define PYBIND11_CHR_NAME "chr"
120121
#define PYBIND11_SLICE_OBJECT PyObject
121122
#define PYBIND11_FROM_STRING PyUnicode_FromString
122123
#define PYBIND11_STR_TYPE ::pybind11::str
@@ -136,6 +137,7 @@
136137
#define PYBIND11_LONG_AS_UNSIGNED_LONGLONG(o) (PyInt_Check(o) ? (unsigned long long) PyLong_AsUnsignedLong(o) : PyLong_AsUnsignedLongLong(o))
137138
#define PYBIND11_BYTES_NAME "str"
138139
#define PYBIND11_STRING_NAME "unicode"
140+
#define PYBIND11_CHR_NAME "unichr"
139141
#define PYBIND11_SLICE_OBJECT PySliceObject
140142
#define PYBIND11_FROM_STRING PyString_FromString
141143
#define PYBIND11_STR_TYPE ::pybind11::bytes

0 commit comments

Comments
 (0)