Skip to content

Commit 24df2d9

Browse files
committed
Added too-long and too-big character conversion errors
With this commit, when casting to a single character, as opposed to a C-style string, we make sure the input wasn't a multi-character string or a single character with codepoint too large for the character type. This also changes the character cast op to CharT instead of CharT& (we need to be able to return a temporary decoded char value, but also because there's little gained by bothering with an lvalue return here). Finally it changes the char caster to 'has-a-string-caster' instead of 'is-a-string-caster' because, with the cast_op change above, there's nothing at all gained from inheritance. This also lets us remove the `success` from the string caster (which was only there for the char caster) into the char caster itself. (I also renamed it to 'none' and inverted its value to better reflect its purpose). The None -> nullptr loading also now takes place only under a `convert = true` load pass. Although it's unlikely that a function taking a char also has overloads that can take a None, it seems marginally more correct to treat it as a conversion. This commit simplifies the size assumptions about character sizes with static_asserts to back them up.
1 parent d931881 commit 24df2d9

File tree

3 files changed

+138
-24
lines changed

3 files changed

+138
-24
lines changed

include/pybind11/cast.h

Lines changed: 78 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -624,33 +624,38 @@ template <> class type_caster<bool> {
624624
PYBIND11_TYPE_CASTER(bool, _("bool"));
625625
};
626626

627-
// Helper class for UTF-{8,16,32} strings:
627+
// Helper class for UTF-{8,16,32} C++ stl strings:
628628
template <typename CharT, class Traits, class Allocator>
629629
struct type_caster<std::basic_string<CharT, Traits, Allocator>, enable_if_t<is_std_char_type<CharT>::value>> {
630-
static constexpr unsigned int UTF_N =
631-
std::is_same<CharT, char>::value ? 8 :
632-
std::is_same<CharT, char16_t>::value ? 16 :
633-
std::is_same<CharT, char32_t>::value ? 32 :
634-
(sizeof(CharT) == 2 ? 16 : 32); /* std::wstring is UTF-16 on Windows, UTF-32 everywhere else */
635-
630+
// Simplify life by being able to assume standard char sizes (the standard only guarantees
631+
// minimums), but Python requires exact sizes
632+
static_assert(!std::is_same<CharT, char>::value || sizeof(CharT) == 1, "Unsupported char size != 1");
633+
static_assert(!std::is_same<CharT, char16_t>::value || sizeof(CharT) == 2, "Unsupported char16_t size != 2");
634+
static_assert(!std::is_same<CharT, char32_t>::value || sizeof(CharT) == 4, "Unsupported char32_t size != 4");
635+
// wchar_t can be either 16 bits (Windows) or 32 (everywhere else)
636+
static_assert(!std::is_same<CharT, wchar_t>::value || sizeof(CharT) == 2 || sizeof(CharT) == 4,
637+
"Unsupported wchar_t size != 2/4");
638+
static constexpr size_t UTF_N = 8 * sizeof(CharT);
636639
static constexpr const char *encoding = UTF_N == 8 ? "utf8" : UTF_N == 16 ? "utf16" : "utf32";
637640

638-
// C++ only requires char/char16_t/char32_t to be at least 8/16/32 bits, but Python's encoding
639-
// assumes exactly 1/2/4 bytes:
640-
static_assert(sizeof(CharT) == UTF_N / 8,
641-
"Internal error: string type_caster requires 1/2/4-sized character types");
642-
643641
using StringType = std::basic_string<CharT, Traits, Allocator>;
644642

645643
bool load(handle src, bool) {
644+
#if PY_VERSION_MAJOR < 3
646645
object temp;
646+
#endif
647647
handle load_src = src;
648648
if (!src) {
649649
return false;
650650
} else if (!PyUnicode_Check(load_src.ptr())) {
651+
#if PY_VERSION_MAJOR >= 3
652+
return false;
653+
// The below is a guaranteed failure in Python 3 when PyUnicode_Check returns false
654+
#else
651655
temp = reinterpret_steal<object>(PyUnicode_FromObject(load_src.ptr()));
652656
if (!temp) { PyErr_Clear(); return false; }
653657
load_src = temp;
658+
#endif
654659
}
655660

656661
object utfNbytes = reinterpret_steal<object>(PyUnicode_AsEncodedString(
@@ -661,7 +666,6 @@ struct type_caster<std::basic_string<CharT, Traits, Allocator>, enable_if_t<is_s
661666
size_t length = (size_t) PYBIND11_BYTES_SIZE(utfNbytes.ptr()) / sizeof(CharT);
662667
if (UTF_N > 8) { buffer++; length--; } // Skip BOM for UTF-16/32
663668
value = StringType(buffer, length);
664-
success = true;
665669
return true;
666670
}
667671

@@ -674,24 +678,29 @@ struct type_caster<std::basic_string<CharT, Traits, Allocator>, enable_if_t<is_s
674678
}
675679

676680
PYBIND11_TYPE_CASTER(StringType, _(PYBIND11_STRING_NAME));
677-
protected:
678-
bool success = false;
679681
};
680682

681-
template <typename CharT> struct type_caster<CharT, enable_if_t<is_std_char_type<CharT>::value>>
682-
: type_caster<std::basic_string<CharT>> {
683+
// Type caster for C-style strings. We basically use a std::string type caster, but also add the
684+
// ability to use None as a nullptr char* (which the string caster doesn't allow).
685+
template <typename CharT> struct type_caster<CharT, enable_if_t<is_std_char_type<CharT>::value>> {
683686
using StringType = std::basic_string<CharT>;
684687
using StringCaster = type_caster<StringType>;
685-
using StringCaster::success;
686-
using StringCaster::value;
688+
StringCaster str_caster;
689+
bool none = false;
687690
public:
688691
bool load(handle src, bool convert) {
689-
if (src.is_none()) return true;
690-
return StringCaster::load(src, convert);
692+
if (!src) return false;
693+
if (src.is_none()) {
694+
// Defer accepting None to other overloads (if we aren't in convert mode):
695+
if (!convert) return false;
696+
none = true;
697+
return true;
698+
}
699+
return str_caster.load(src, convert);
691700
}
692701

693702
static handle cast(const CharT *src, return_value_policy policy, handle parent) {
694-
if (src == nullptr) return none().inc_ref();
703+
if (src == nullptr) return pybind11::none().inc_ref();
695704
return StringCaster::cast(StringType(src), policy, parent);
696705
}
697706

@@ -704,10 +713,55 @@ template <typename CharT> struct type_caster<CharT, enable_if_t<is_std_char_type
704713
return StringCaster::cast(StringType(1, src), policy, parent);
705714
}
706715

707-
operator CharT*() { return success ? const_cast<CharT *>(value.c_str()) : nullptr; }
708-
operator CharT&() { return value[0]; }
716+
operator CharT*() { return none ? nullptr : const_cast<CharT *>(static_cast<StringType &>(str_caster).c_str()); }
717+
operator CharT() {
718+
if (none)
719+
throw value_error("Cannot convert None to a character");
720+
721+
auto &value = static_cast<StringType &>(str_caster);
722+
size_t str_len = value.size();
723+
if (str_len == 0)
724+
throw value_error("Cannot convert empty string to a character");
725+
726+
// If we're in UTF-8 mode, we have two possible failures: one for a unicode character that
727+
// is too high, and one for multiple unicode characters (caught later), so we need to figure
728+
// out how long the first encoded character is in bytes to distinguish between these two
729+
// errors. We also allow want to allow unicode characters U+0080 through U+00FF, as those
730+
// can fit into a single char value.
731+
if (StringCaster::UTF_N == 8 && str_len > 1 && str_len <= 4) {
732+
unsigned char v0 = static_cast<unsigned char>(value[0]);
733+
size_t char0_bytes = !(v0 & 0x80) ? 1 : // low bits only: 0-127
734+
(v0 & 0xE0) == 0xC0 ? 2 : // 0b110xxxxx - start of 2-byte sequence
735+
(v0 & 0xF0) == 0xE0 ? 3 : // 0b1110xxxx - start of 3-byte sequence
736+
4; // 0b11110xxx - start of 4-byte sequence
737+
738+
if (char0_bytes == str_len) {
739+
// If we have a 128-255 value, we can decode it into a single char:
740+
if (char0_bytes == 2 && (v0 & 0xFC) == 0xC0) { // 0x110000xx 0x10xxxxxx
741+
return static_cast<char>(((v0 & 3) << 6) + (static_cast<unsigned char>(value[1]) & 0x3F));
742+
}
743+
// Otherwise we have a single character, but it's > U+00FF
744+
throw value_error("Character code point not in range(0x100)");
745+
}
746+
}
747+
748+
// UTF-16 is much easier: we can only have a surrogate pair for values above U+FFFF, thus a
749+
// surrogate pair with total length 2 instantly indicates a range error (but not a "your
750+
// string was too long" error).
751+
else if (StringCaster::UTF_N == 16 && str_len == 2) {
752+
char16_t v0 = static_cast<char16_t>(value[0]);
753+
if (v0 >= 0xD800 && v0 < 0xE000)
754+
throw value_error("Character code point not in range(0x10000)");
755+
}
756+
757+
if (str_len != 1)
758+
throw value_error("Expected a character, but multi-character string found");
759+
760+
return value[0];
761+
}
709762

710763
static PYBIND11_DESCR name() { return type_descr(_(PYBIND11_STRING_NAME)); }
764+
template <typename _T> using cast_op_type = typename std::remove_reference<pybind11::detail::cast_op_type<_T>>::type;
711765
};
712766

713767
template <typename T1, typename T2> class type_caster<std::pair<T1, T2>> {

tests/test_python_types.cpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -458,6 +458,12 @@ test_initializer python_types([](py::module &m) {
458458
m.def("u16_ibang", [=]() -> char16_t { return ib16; });
459459
m.def("u32_mathbfA", [=]() -> char32_t { return mathbfA32; });
460460
m.def("wchar_heart", []() -> wchar_t { return 0x2665; });
461+
462+
m.attr("wchar_size") = py::cast(sizeof(wchar_t));
463+
m.def("ord_char", [](char c) { return 0 + static_cast<unsigned char>(c); });
464+
m.def("ord_char16", [](char16_t c) { return 0 + c; });
465+
m.def("ord_char32", [](char32_t c) { return 0 + c; });
466+
m.def("ord_wchar", [](wchar_t c) { return 0 + c; });
461467
});
462468

463469
#if defined(_MSC_VER)

tests/test_python_types.py

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -446,3 +446,57 @@ def test_unicode_conversion():
446446
assert u16_ibang() == u'‽'
447447
assert u32_mathbfA() == u'𝐀'
448448
assert wchar_heart() == u'♥'
449+
450+
451+
def test_single_char_arguments():
452+
"""Tests failures for passing invalid inputs to char-accepting functions"""
453+
from pybind11_tests import ord_char, ord_char16, ord_char32, ord_wchar, wchar_size
454+
455+
def toobig_message(r): return "Character code point not in range({0:#x})".format(r)
456+
toolong_message = "Expected a character, but multi-character string found"
457+
458+
assert ord_char(u'a') == 0x61 # simple ASCII
459+
assert ord_char(u'é') == 0xE9 # requires 2 bytes in utf-8, but can be stuffed in a char
460+
with pytest.raises(ValueError) as excinfo:
461+
assert ord_char(u'Ā') == 0x100 # requires 2 bytes, doesn't fit in a char
462+
assert str(excinfo.value) == toobig_message(0x100)
463+
with pytest.raises(ValueError) as excinfo:
464+
assert ord_char(u'ab')
465+
assert str(excinfo.value) == toolong_message
466+
467+
assert ord_char16(u'a') == 0x61
468+
assert ord_char16(u'é') == 0xE9
469+
assert ord_char16(u'Ā') == 0x100
470+
assert ord_char16(u'‽') == 0x203d
471+
assert ord_char16(u'♥') == 0x2665
472+
with pytest.raises(ValueError) as excinfo:
473+
assert ord_char16(u'🎂') == 0x1F382 # requires surrogate pair
474+
assert str(excinfo.value) == toobig_message(0x10000)
475+
with pytest.raises(ValueError) as excinfo:
476+
assert ord_char16(u'aa')
477+
assert str(excinfo.value) == toolong_message
478+
479+
assert ord_char32(u'a') == 0x61
480+
assert ord_char32(u'é') == 0xE9
481+
assert ord_char32(u'Ā') == 0x100
482+
assert ord_char32(u'‽') == 0x203d
483+
assert ord_char32(u'♥') == 0x2665
484+
assert ord_char32(u'🎂') == 0x1F382
485+
with pytest.raises(ValueError) as excinfo:
486+
assert ord_char32(u'aa')
487+
assert str(excinfo.value) == toolong_message
488+
489+
assert ord_wchar(u'a') == 0x61
490+
assert ord_wchar(u'é') == 0xE9
491+
assert ord_wchar(u'Ā') == 0x100
492+
assert ord_wchar(u'‽') == 0x203d
493+
assert ord_wchar(u'♥') == 0x2665
494+
if wchar_size == 2:
495+
with pytest.raises(ValueError) as excinfo:
496+
assert ord_wchar(u'🎂') == 0x1F382 # requires surrogate pair
497+
assert str(excinfo.value) == toobig_message(0x10000)
498+
else:
499+
assert ord_wchar(u'🎂') == 0x1F382
500+
with pytest.raises(ValueError) as excinfo:
501+
assert ord_wchar(u'aa')
502+
assert str(excinfo.value) == toolong_message

0 commit comments

Comments
 (0)