Skip to content

Commit 30748f8

Browse files
authored
Avoid heap allocation for function calls with a small number of args (#5824)
* Avoid heap allocation for function calls with a small number of arguments. We don't have access to llvm::SmallVector or similar, but given the limited subset of the `std::vector` API that `function_call::args{,_convert}` need and the "reserve-then-fill" usage pattern, it is relatively straightforward to implement custom containers that get the job done. Seems to improve the time to call the collatz function in pybind/pybind11_benchmark significantly; numbers are a little noisy but there's a clear improvement from "about 60 ns per call" to "about 45 ns per call" on my machine (M4 Max Mac), as measured with `timeit.repeat('collatz(4)', 'from pybind11_benchmark import collatz')`. * clang-tidy * more clang-tidy * clang-tidy NOLINTBEGIN/END instead of NOLINTNEXTLINE * forgot to increase inline size after removing std::variant * constexpr arg_vector_small_size, use move instead of swap to hopefully clarify second_pass_convert * rename test_embed to test_low_level * rename test_low_level to test_with_catch * Be careful to NOINLINE slow paths * rename array/vector members to iarray/hvector. Move comment per request. Add static_asserts for our untagged union implementation per request. * drop is_standard_layout assertions; see #5824 (comment)
1 parent 326b106 commit 30748f8

File tree

16 files changed

+532
-18
lines changed

16 files changed

+532
-18
lines changed

CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -180,6 +180,7 @@ if(PYBIND11_MASTER_PROJECT)
180180
endif()
181181

182182
set(PYBIND11_HEADERS
183+
include/pybind11/detail/argument_vector.h
183184
include/pybind11/detail/class.h
184185
include/pybind11/detail/common.h
185186
include/pybind11/detail/cpp_conduit.h

include/pybind11/cast.h

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010

1111
#pragma once
1212

13+
#include "detail/argument_vector.h"
1314
#include "detail/common.h"
1415
#include "detail/descr.h"
1516
#include "detail/native_enum_data.h"
@@ -2037,6 +2038,10 @@ using is_pos_only = std::is_same<intrinsic_t<T>, pos_only>;
20372038
// forward declaration (definition in attr.h)
20382039
struct function_record;
20392040

2041+
/// (Inline size chosen mostly arbitrarily; 6 should pad function_call out to two cache lines
2042+
/// (16 pointers) in size.)
2043+
constexpr std::size_t arg_vector_small_size = 6;
2044+
20402045
/// Internal data associated with a single function call
20412046
struct function_call {
20422047
function_call(const function_record &f, handle p); // Implementation in attr.h
@@ -2045,10 +2050,10 @@ struct function_call {
20452050
const function_record &func;
20462051

20472052
/// Arguments passed to the function:
2048-
std::vector<handle> args;
2053+
argument_vector<arg_vector_small_size> args;
20492054

20502055
/// The `convert` value the arguments should be loaded with
2051-
std::vector<bool> args_convert;
2056+
args_convert_vector<arg_vector_small_size> args_convert;
20522057

20532058
/// Extra references for the optional `py::args` and/or `py::kwargs` arguments (which, if
20542059
/// present, are also in `args` but without a reference).
Lines changed: 330 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,330 @@
1+
/*
2+
pybind11/detail/argument_vector.h: small_vector-like containers to
3+
avoid heap allocation of arguments during function call dispatch.
4+
5+
Copyright (c) Meta Platforms, Inc. and affiliates.
6+
7+
All rights reserved. Use of this source code is governed by a
8+
BSD-style license that can be found in the LICENSE file.
9+
*/
10+
11+
#pragma once
12+
13+
#include <pybind11/pytypes.h>
14+
15+
#include "common.h"
16+
17+
#include <algorithm>
18+
#include <array>
19+
#include <cstddef>
20+
#include <cstdint>
21+
#include <cstring>
22+
#include <iterator>
23+
#include <type_traits>
24+
#include <utility>
25+
#include <vector>
26+
27+
PYBIND11_NAMESPACE_BEGIN(PYBIND11_NAMESPACE)
28+
29+
PYBIND11_WARNING_DISABLE_MSVC(4127)
30+
31+
PYBIND11_NAMESPACE_BEGIN(detail)
32+
33+
// Shared implementation utility for our small_vector-like containers.
34+
// We support C++11 and C++14, so we cannot use
35+
// std::variant. Union with the tag packed next to the inline
36+
// array's size is smaller anyway, allowing 1 extra handle of
37+
// inline storage for free. Compare the layouts (1 line per
38+
// size_t/void*, assuming a 64-bit machine):
39+
// With variant, total is N + 2 for N >= 2:
40+
// - variant tag (cannot be packed with the array size)
41+
// - array size (or first pointer of 3 in std::vector)
42+
// - N pointers of inline storage (or 2 remaining pointers of std::vector)
43+
// Custom union, total is N + 1 for N >= 3:
44+
// - variant tag & array size if applicable
45+
// - N pointers of inline storage (or 3 pointers of std::vector)
46+
//
47+
// NOTE: this is a low-level representational convenience; the two
48+
// use cases of this union are materially different and in particular
49+
// have different semantics for inline_array::size. All that is being
50+
// shared is the memory management behavior.
51+
// Untagged union of `inline_array` and `heap_vector`. Both alternatives start
// with a bool `is_inline` at offset 0 (see the static_asserts in is_inline()),
// and that leading bool is what identifies the active member.
template <typename ArrayT, std::size_t InlineSize, typename VectorT = ArrayT>
52+
union inline_array_or_vector {
53+
// Inline alternative: fixed-capacity storage plus an element count.
struct inline_array {
54+
// Tag: true marks the inline alternative.
bool is_inline = true;
55+
// Element count maintained by the owning container; this union copies it
// but never interprets it.
std::uint32_t size = 0;
56+
// Fixed-capacity inline storage.
std::array<ArrayT, InlineSize> arr;
57+
};
58+
// Heap alternative: a std::vector behind the same leading tag.
struct heap_vector {
59+
// Tag: false marks the heap alternative.
bool is_inline = false;
60+
std::vector<VectorT> vec;
61+
62+
heap_vector() = default;
63+
// Builds a vector holding `count` copies of `value`.
heap_vector(std::size_t count, VectorT value) : vec(count, value) {}
64+
};
65+
66+
inline_array iarray;
67+
heap_vector hvector;
68+
69+
// The inline alternative is relocated with memcpy and is overwritten or
// dropped without running a destructor, hence these requirements on ArrayT.
static_assert(std::is_trivially_move_constructible<ArrayT>::value,
70+
"ArrayT must be trivially move constructible");
71+
static_assert(std::is_trivially_destructible<ArrayT>::value,
72+
"ArrayT must be trivially destructible");
73+
74+
// Starts in the inline state (is_inline == true, size == 0).
inline_array_or_vector() : iarray() {}
75+
// Only the heap alternative needs an explicit destructor call.
~inline_array_or_vector() {
76+
if (!is_inline()) {
77+
hvector.~heap_vector();
78+
}
79+
}
80+
// Disable copy ctor and assignment.
81+
inline_array_or_vector(const inline_array_or_vector &) = delete;
82+
inline_array_or_vector &operator=(const inline_array_or_vector &) = delete;
83+
84+
// Move construction: an inline rhs is copied bytewise (tag and size
// included); a heap rhs has its heap_vector move-constructed into this
// object. Either way rhs keeps the same active member it had before.
inline_array_or_vector(inline_array_or_vector &&rhs) noexcept {
85+
if (rhs.is_inline()) {
86+
std::memcpy(&iarray, &rhs.iarray, sizeof(iarray));
87+
} else {
88+
new (&hvector) heap_vector(std::move(rhs.hvector));
89+
}
90+
assert(is_inline() == rhs.is_inline());
91+
}
92+
93+
// Move assignment; self-assignment is a no-op. If rhs is inline, any heap
// vector held here is destroyed first and rhs's bytes are copied over. If rhs
// is on the heap, its heap_vector is either move-constructed over our inline
// storage or move-assigned onto our existing heap_vector.
inline_array_or_vector &operator=(inline_array_or_vector &&rhs) noexcept {
94+
if (this == &rhs) {
95+
return *this;
96+
}
97+
98+
if (rhs.is_inline()) {
99+
if (!is_inline()) {
100+
hvector.~heap_vector();
101+
}
102+
std::memcpy(&iarray, &rhs.iarray, sizeof(iarray));
103+
} else {
104+
if (is_inline()) {
105+
new (&hvector) heap_vector(std::move(rhs.hvector));
106+
} else {
107+
hvector = std::move(rhs.hvector);
108+
}
109+
}
110+
return *this;
111+
}
112+
113+
// Returns the leading bool tag, i.e. whether `iarray` is the active member.
bool is_inline() const {
114+
// It is undefined behavior to access the inactive member of a
115+
// union directly. However, it is well-defined to reinterpret_cast any
116+
// pointer into a pointer to char and examine it as an array
117+
// of bytes. See
118+
// https://dev-discuss.pytorch.org/t/unionizing-for-profit-how-to-exploit-the-power-of-unions-in-c/444#the-memcpy-loophole-4
119+
bool result = false;
120+
static_assert(offsetof(inline_array, is_inline) == 0,
121+
"untagged union implementation relies on this");
122+
static_assert(offsetof(heap_vector, is_inline) == 0,
123+
"untagged union implementation relies on this");
124+
// Copy sizeof(bool) bytes from the start of the union into `result`.
std::memcpy(&result, reinterpret_cast<const char *>(this), sizeof(bool));
125+
return result;
126+
}
127+
};
128+
129+
// small_vector-like container to avoid heap allocation for N or fewer
130+
// arguments.
131+
// Stores `handle`s in an inline array of capacity N and switches to a
// std::vector<handle> once more than N slots are needed. Move-only.
template <std::size_t N>
132+
struct argument_vector {
133+
public:
134+
argument_vector() = default;
135+
136+
// Disable copy ctor and assignment.
137+
argument_vector(const argument_vector &) = delete;
138+
argument_vector &operator=(const argument_vector &) = delete;
139+
argument_vector(argument_vector &&) noexcept = default;
140+
argument_vector &operator=(argument_vector &&) noexcept = default;
141+
142+
// Number of handles stored so far.
std::size_t size() const {
143+
if (is_inline()) {
144+
return m_repr.iarray.size;
145+
}
146+
return m_repr.hvector.vec.size();
147+
}
148+
149+
// Mutable element access; idx must be < size() (checked via assert only).
handle &operator[](std::size_t idx) {
150+
assert(idx < size());
151+
if (is_inline()) {
152+
return m_repr.iarray.arr[idx];
153+
}
154+
return m_repr.hvector.vec[idx];
155+
}
156+
157+
// Const element access; returns the handle by value.
handle operator[](std::size_t idx) const {
158+
assert(idx < size());
159+
if (is_inline()) {
160+
return m_repr.iarray.arr[idx];
161+
}
162+
return m_repr.hvector.vec[idx];
163+
}
164+
165+
// Appends x. The common case writes into the inline array; a full inline
// array (size == N) is first migrated to a heap vector reserved for N + 1
// elements, after which x is appended to that vector.
void push_back(handle x) {
166+
if (is_inline()) {
167+
auto &ha = m_repr.iarray;
168+
if (ha.size == N) {
169+
move_to_heap_vector_with_reserved_size(N + 1);
170+
push_back_slow_path(x);
171+
} else {
172+
ha.arr[ha.size++] = x;
173+
}
174+
} else {
175+
push_back_slow_path(x);
176+
}
177+
}
178+
179+
// Converts x to a handle and appends it. x is not forwarded, so this is
// equivalent to push_back(handle(x)).
template <typename Arg>
180+
void emplace_back(Arg &&x) {
181+
push_back(handle(x));
182+
}
183+
184+
// Ensures room for sz elements. While inline this does nothing unless
// sz > N, in which case storage moves to a heap vector reserved for sz.
void reserve(std::size_t sz) {
185+
if (is_inline()) {
186+
if (sz > N) {
187+
move_to_heap_vector_with_reserved_size(sz);
188+
}
189+
} else {
190+
reserve_slow_path(sz);
191+
}
192+
}
193+
194+
private:
195+
using repr_type = inline_array_or_vector<handle, N>;
196+
// Inline array or heap vector of handles.
repr_type m_repr;
197+
198+
// Copies the ha.size inline handles into a freshly reserved vector, then
// constructs the heap alternative over the union storage, replacing the
// inline one. Precondition (asserted): currently inline.
PYBIND11_NOINLINE void move_to_heap_vector_with_reserved_size(std::size_t reserved_size) {
199+
assert(is_inline());
200+
auto &ha = m_repr.iarray;
201+
using heap_vector = typename repr_type::heap_vector;
202+
heap_vector hv;
203+
hv.vec.reserve(reserved_size);
204+
std::copy(ha.arr.begin(), ha.arr.begin() + ha.size, std::back_inserter(hv.vec));
205+
new (&m_repr.hvector) heap_vector(std::move(hv));
206+
}
207+
208+
// Heap-path helpers; callers only reach these when the heap alternative is
// active. Both simply delegate to the underlying std::vector.
PYBIND11_NOINLINE void push_back_slow_path(handle x) { m_repr.hvector.vec.push_back(x); }
209+
210+
PYBIND11_NOINLINE void reserve_slow_path(std::size_t sz) { m_repr.hvector.vec.reserve(sz); }
211+
212+
// True while the elements live in the inline array.
bool is_inline() const { return m_repr.is_inline(); }
213+
};
214+
215+
// small_vector-like container of bool flags. While inline, the flags are
216+
// bit-packed into std::size_t words; kInlineSize or fewer flags need no heap.
217+
template <std::size_t kRequestedInlineSize>
218+
struct args_convert_vector {
219+
// NOTE(review): this access section is empty; `public:` follows immediately.
private:
220+
public:
221+
args_convert_vector() = default;
222+
223+
// Disable copy ctor and assignment.
224+
args_convert_vector(const args_convert_vector &) = delete;
225+
args_convert_vector &operator=(const args_convert_vector &) = delete;
226+
args_convert_vector(args_convert_vector &&) noexcept = default;
227+
args_convert_vector &operator=(args_convert_vector &&) noexcept = default;
228+
229+
// Creates `count` flags all equal to `value`. Counts above kInlineSize go
// straight to a heap vector; otherwise every inline word is filled with
// all-ones or zero (so bits past `count` also carry `value`) and size is set.
args_convert_vector(std::size_t count, bool value) {
230+
if (count > kInlineSize) {
231+
new (&m_repr.hvector) typename repr_type::heap_vector(count, value);
232+
} else {
233+
auto &inline_arr = m_repr.iarray;
234+
inline_arr.arr.fill(value ? std::size_t(-1) : 0);
235+
inline_arr.size = static_cast<decltype(inline_arr.size)>(count);
236+
}
237+
}
238+
239+
// Number of flags stored. In the inline state, iarray.size counts flags
// (bits), not words.
std::size_t size() const {
240+
if (is_inline()) {
241+
return m_repr.iarray.size;
242+
}
243+
return m_repr.hvector.vec.size();
244+
}
245+
246+
// Ensures room for sz flags. While inline this does nothing unless
// sz > kInlineSize, in which case storage moves to a heap vector reserved
// for sz.
void reserve(std::size_t sz) {
247+
if (is_inline()) {
248+
if (sz > kInlineSize) {
249+
move_to_heap_vector_with_reserved_size(sz);
250+
}
251+
} else {
252+
m_repr.hvector.vec.reserve(sz);
253+
}
254+
}
255+
256+
// Returns flag idx by value.
// NOTE(review): only the heap path asserts that idx is in range; the inline
// path checks the derived word/bit indices but not idx < size().
bool operator[](std::size_t idx) const {
257+
if (is_inline()) {
258+
return inline_index(idx);
259+
}
260+
assert(idx < m_repr.hvector.vec.size());
261+
return m_repr.hvector.vec[idx];
262+
}
263+
264+
// Appends b. The common case sets or clears the next inline bit; a full
// inline array (size == kInlineSize) is first migrated to a heap vector
// reserved for kInlineSize + 1 flags, after which b is appended there.
void push_back(bool b) {
265+
if (is_inline()) {
266+
auto &ha = m_repr.iarray;
267+
if (ha.size == kInlineSize) {
268+
move_to_heap_vector_with_reserved_size(kInlineSize + 1);
269+
push_back_slow_path(b);
270+
} else {
271+
assert(ha.size < kInlineSize);
272+
// Locate the bit for the new element and bump the count.
const auto wbi = word_and_bit_index(ha.size++);
273+
assert(wbi.word < kWords);
274+
assert(wbi.bit < kBitsPerWord);
275+
// The bit is written explicitly in both cases, so its previous value
// does not matter.
if (b) {
276+
ha.arr[wbi.word] |= (std::size_t(1) << wbi.bit);
277+
} else {
278+
ha.arr[wbi.word] &= ~(std::size_t(1) << wbi.bit);
279+
}
280+
assert(operator[](ha.size - 1) == b);
281+
}
282+
} else {
283+
push_back_slow_path(b);
284+
}
285+
}
286+
287+
// Exchanges contents with rhs via std::swap on the underlying representation.
void swap(args_convert_vector &rhs) noexcept { std::swap(m_repr, rhs.m_repr); }
288+
289+
private:
290+
// Position of one flag inside the inline word array.
struct WordAndBitIndex {
291+
std::size_t word;
292+
std::size_t bit;
293+
};
294+
295+
// Splits a flag index into (word index, bit position within that word).
static WordAndBitIndex word_and_bit_index(std::size_t idx) {
296+
return WordAndBitIndex{idx / kBitsPerWord, idx % kBitsPerWord};
297+
}
298+
299+
// Reads flag idx from the inline words; a non-zero masked value converts
// to true.
bool inline_index(std::size_t idx) const {
300+
const auto wbi = word_and_bit_index(idx);
301+
assert(wbi.word < kWords);
302+
assert(wbi.bit < kBitsPerWord);
303+
return m_repr.iarray.arr[wbi.word] & (std::size_t(1) << wbi.bit);
304+
}
305+
306+
// Unpacks the inline_arr.size inline bits into a freshly reserved
// std::vector<bool>, then constructs the heap alternative over the union
// storage, replacing the inline one.
PYBIND11_NOINLINE void move_to_heap_vector_with_reserved_size(std::size_t reserved_size) {
307+
auto &inline_arr = m_repr.iarray;
308+
using heap_vector = typename repr_type::heap_vector;
309+
heap_vector hv;
310+
hv.vec.reserve(reserved_size);
311+
for (std::size_t ii = 0; ii < inline_arr.size; ++ii) {
312+
hv.vec.push_back(inline_index(ii));
313+
}
314+
new (&m_repr.hvector) heap_vector(std::move(hv));
315+
}
316+
317+
// Heap-path append; callers only reach this when the heap alternative is
// active.
PYBIND11_NOINLINE void push_back_slow_path(bool b) { m_repr.hvector.vec.push_back(b); }
318+
319+
// Bits in one storage word (assumes 8-bit bytes).
static constexpr auto kBitsPerWord = 8 * sizeof(std::size_t);
320+
// Words needed to hold kRequestedInlineSize bits, rounded up.
static constexpr auto kWords = (kRequestedInlineSize + kBitsPerWord - 1) / kBitsPerWord;
321+
// Actual inline capacity in flags: a whole number of words, so it is at
// least the requested size.
static constexpr auto kInlineSize = kWords * kBitsPerWord;
322+
323+
// Inline storage is kWords std::size_t words; heap storage is std::vector<bool>.
using repr_type = inline_array_or_vector<std::size_t, kWords, bool>;
324+
repr_type m_repr;
325+
326+
// True while the flags live in the inline words.
bool is_inline() const { return m_repr.is_inline(); }
327+
};
328+
329+
PYBIND11_NAMESPACE_END(detail)
330+
PYBIND11_NAMESPACE_END(PYBIND11_NAMESPACE)

include/pybind11/pybind11.h

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1048,13 +1048,14 @@ class cpp_function : public function {
10481048
}
10491049
#endif
10501050

1051-
std::vector<bool> second_pass_convert;
1051+
args_convert_vector<arg_vector_small_size> second_pass_convert;
10521052
if (overloaded) {
10531053
// We're in the first no-convert pass, so swap out the conversion flags for a
10541054
// set of all-false flags. If the call fails, we'll swap the flags back in for
10551055
// the conversion-allowed call below.
1056-
second_pass_convert.resize(func.nargs, false);
1057-
call.args_convert.swap(second_pass_convert);
1056+
second_pass_convert = std::move(call.args_convert);
1057+
call.args_convert
1058+
= args_convert_vector<arg_vector_small_size>(func.nargs, false);
10581059
}
10591060

10601061
// 6. Call the function.

tests/CMakeLists.txt

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -647,8 +647,8 @@ if(NOT PYBIND11_CUDA_TESTS)
647647
# Test pure C++ code (not depending on Python). Provides the `test_pure_cpp` target.
648648
add_subdirectory(pure_cpp)
649649

650-
# Test embedding the interpreter. Provides the `cpptest` target.
651-
add_subdirectory(test_embed)
650+
# Test C++ code that depends on Python, such as embedding the interpreter. Provides the `cpptest` target.
651+
add_subdirectory(test_with_catch)
652652

653653
# Test CMake build using functions and targets from subdirectory or installed location
654654
add_subdirectory(test_cmake_build)

0 commit comments

Comments
 (0)