Skip to content

Commit 21f068d

Browse files
authored
gh-109587: Allow "precompiled" perf-trampolines to largely mitigate the cost of enabling perf-trampolines (#109666)
1 parent 3d2f1f0 commit 21f068d

File tree

8 files changed

+199
-10
lines changed

8 files changed

+199
-10
lines changed

Include/cpython/sysmodule.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,3 +21,6 @@ PyAPI_FUNC(int) PyUnstable_WritePerfMapEntry(
2121
unsigned int code_size,
2222
const char *entry_name);
2323
PyAPI_FUNC(void) PyUnstable_PerfMapState_Fini(void);
24+
PyAPI_FUNC(int) PyUnstable_CopyPerfMapFile(const char* parent_filename);
25+
PyAPI_FUNC(int) PyUnstable_PerfTrampoline_CompileCode(PyCodeObject *);
26+
PyAPI_FUNC(int) PyUnstable_PerfTrampoline_SetPersistAfterFork(int enable);

Include/internal/pycore_ceval_state.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,7 @@ struct _ceval_runtime_state {
5555
struct code_arena_st *code_arena;
5656
struct trampoline_api_st trampoline_api;
5757
FILE *map_file;
58+
Py_ssize_t persist_after_fork;
5859
#else
5960
int _not_used;
6061
#endif
@@ -68,6 +69,7 @@ struct _ceval_runtime_state {
6869
{ \
6970
.status = PERF_STATUS_NO_INIT, \
7071
.extra_code_index = -1, \
72+
.persist_after_fork = 0, \
7173
}
7274
#else
7375
# define _PyEval_RUNTIME_PERF_INIT {0}

Include/sysmodule.h

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,3 @@
1-
2-
/* System module interface */
3-
41
#ifndef Py_SYSMODULE_H
52
#define Py_SYSMODULE_H
63
#ifdef __cplusplus

Lib/test/test_perf_profiler.py

Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -353,6 +353,82 @@ def baz(n):
353353
self.assertNotIn(f"py::bar:{script}", stdout)
354354
self.assertNotIn(f"py::baz:{script}", stdout)
355355

356+
# Checks that perf trampolines compiled before os.fork() are recorded in the
# perf-map files of both the parent and the forked child.  The embedded script
# calls perf_trampoline_set_persist_after_fork(1), compiles a trampoline for
# every global that is callable and has a __code__ attribute, then forks: the
# child prints its pid and calls bar_fork(), the parent calls bar().
def test_pre_fork_compile(self):
357+
code = """if 1:
358+
import sys
359+
import os
360+
import sysconfig
361+
from _testinternalcapi import (
362+
compile_perf_trampoline_entry,
363+
perf_trampoline_set_persist_after_fork,
364+
)
365+
366+
def foo_fork():
367+
pass
368+
369+
def bar_fork():
370+
foo_fork()
371+
372+
def foo():
373+
pass
374+
375+
def bar():
376+
foo()
377+
378+
def compile_trampolines_for_all_functions():
379+
perf_trampoline_set_persist_after_fork(1)
380+
for _, obj in globals().items():
381+
if callable(obj) and hasattr(obj, '__code__'):
382+
compile_perf_trampoline_entry(obj.__code__)
383+
384+
if __name__ == "__main__":
385+
compile_trampolines_for_all_functions()
386+
pid = os.fork()
387+
if pid == 0:
388+
print(os.getpid())
389+
bar_fork()
390+
else:
391+
bar()
392+
"""
393+
394+
# Run the script in a separate interpreter started with -Xperf and capture
# both output streams.
with temp_dir() as script_dir:
395+
script = make_script(script_dir, "perftest", code)
396+
with subprocess.Popen(
397+
[sys.executable, "-Xperf", script],
398+
universal_newlines=True,
399+
stderr=subprocess.PIPE,
400+
stdout=subprocess.PIPE,
401+
) as process:
402+
stdout, stderr = process.communicate()
403+
404+
self.assertEqual(process.returncode, 0)
405+
self.assertNotIn("Error:", stderr)
406+
# The only thing the script writes to stdout is the child's pid (printed by
# the forked child), which identifies the child's perf-map file.
child_pid = int(stdout.strip())
407+
perf_file = pathlib.Path(f"/tmp/perf-{process.pid}.map")
408+
perf_child_file = pathlib.Path(f"/tmp/perf-{child_pid}.map")
409+
self.assertTrue(perf_file.exists())
410+
self.assertTrue(perf_child_file.exists())
411+
412+
# The parent's map must list all four pre-compiled functions.
perf_file_contents = perf_file.read_text()
413+
self.assertIn(f"py::foo:{script}", perf_file_contents)
414+
self.assertIn(f"py::bar:{script}", perf_file_contents)
415+
self.assertIn(f"py::foo_fork:{script}", perf_file_contents)
416+
self.assertIn(f"py::bar_fork:{script}", perf_file_contents)
417+
418+
# The child's map must list at least the functions the child executes.
child_perf_file_contents = perf_child_file.read_text()
419+
self.assertIn(f"py::foo_fork:{script}", child_perf_file_contents)
420+
self.assertIn(f"py::bar_fork:{script}", child_perf_file_contents)
421+
422+
# Pre-compiled perf-map entries of a forked process must be
423+
# identical in both the parent and child perf-map files.
424+
perf_file_lines = perf_file_contents.split("\n")
425+
for line in perf_file_lines:
426+
if (
427+
f"py::foo_fork:{script}" in line
428+
or f"py::bar_fork:{script}" in line
429+
):
430+
self.assertIn(line, child_perf_file_contents)
356432

357433
if __name__ == "__main__":
358434
unittest.main()
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
Introduced :c:func:`PyUnstable_PerfTrampoline_CompileCode`, :c:func:`PyUnstable_PerfTrampoline_SetPersistAfterFork` and
2+
:c:func:`PyUnstable_CopyPerfMapFile`. These functions allow extension modules to initialize trampolines eagerly, after the application is "warmed up". This makes it possible to keep perf trampolines always enabled.

Modules/_testinternalcapi.c

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1556,6 +1556,36 @@ _testinternalcapi_test_long_numbits_impl(PyObject *module)
15561556
Py_RETURN_NONE;
15571557
}
15581558

1559+
/* Test helper: eagerly compile a perf trampoline for a single code object.
 *
 * Takes exactly one argument, which must be a code object (enforced by the
 * "O!" format with PyCode_Type).  Returns the integer status reported by
 * PyUnstable_PerfTrampoline_CompileCode() when that status is 0; any other
 * status is turned into an AssertionError.
 */
static PyObject *
compile_perf_trampoline_entry(PyObject *self, PyObject *args)
{
    PyObject *code_obj;
    int status;

    if (!PyArg_ParseTuple(args, "O!", &PyCode_Type, &code_obj)) {
        return NULL;
    }

    status = PyUnstable_PerfTrampoline_CompileCode((PyCodeObject *)code_obj);
    if (status == 0) {
        return PyLong_FromLong(status);
    }

    PyErr_SetString(PyExc_AssertionError, "Failed to compile trampoline");
    return NULL;
}
1573+
1574+
/* Test helper: set the "persist perf trampolines after fork" flag.
 *
 * Takes one C int argument and forwards it to
 * PyUnstable_PerfTrampoline_SetPersistAfterFork(), which returns the value
 * now stored in the flag (or 0 when perf trampolines are not compiled in).
 *
 * Returns that value as a Python int.  Raises AssertionError when the stored
 * value differs from the requested one.  Comparing against `enable` rather
 * than against 0 lets callers switch the flag back off: a successful
 * "disable" also returns 0 and must not be reported as a failure.
 */
static PyObject *
perf_trampoline_set_persist_after_fork(PyObject *self, PyObject *args)
{
    int enable;
    if (!PyArg_ParseTuple(args, "i", &enable)) {
        return NULL;
    }
    int ret = PyUnstable_PerfTrampoline_SetPersistAfterFork(enable);
    if (ret != enable) {
        PyErr_SetString(PyExc_AssertionError, "Failed to set persist_after_fork");
        return NULL;
    }
    return PyLong_FromLong(ret);
}
1588+
15591589

15601590
static PyMethodDef module_functions[] = {
15611591
{"get_configs", get_configs, METH_NOARGS},
@@ -1613,6 +1643,8 @@ static PyMethodDef module_functions[] = {
16131643
{"run_in_subinterp_with_config",
16141644
_PyCFunction_CAST(run_in_subinterp_with_config),
16151645
METH_VARARGS | METH_KEYWORDS},
1646+
{"compile_perf_trampoline_entry", compile_perf_trampoline_entry, METH_VARARGS},
1647+
{"perf_trampoline_set_persist_after_fork", perf_trampoline_set_persist_after_fork, METH_VARARGS},
16161648
_TESTINTERNALCAPI_WRITE_UNRAISABLE_EXC_METHODDEF
16171649
_TESTINTERNALCAPI_TEST_LONG_NUMBITS_METHODDEF
16181650
{NULL, NULL} /* sentinel */

Python/perf_trampoline.c

Lines changed: 44 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -193,7 +193,7 @@ typedef struct trampoline_api_st trampoline_api_t;
193193
#define perf_code_arena _PyRuntime.ceval.perf.code_arena
194194
#define trampoline_api _PyRuntime.ceval.perf.trampoline_api
195195
#define perf_map_file _PyRuntime.ceval.perf.map_file
196-
196+
#define persist_after_fork _PyRuntime.ceval.perf.persist_after_fork
197197

198198
static void
199199
perf_map_write_entry(void *state, const void *code_addr,
@@ -361,6 +361,26 @@ py_trampoline_evaluator(PyThreadState *ts, _PyInterpreterFrame *frame,
361361
}
362362
#endif // PY_HAVE_PERF_TRAMPOLINE
363363

364+
/* Eagerly compile a perf trampoline for the code object `co`.
 *
 * If the code object already carries a trampoline in its extra slot
 * (extra_code_index), nothing is done.  Otherwise a new trampoline is
 * compiled, an entry for it is emitted through trampoline_api.write_state,
 * and the trampoline is stored in the code object's extra slot.
 *
 * Returns 0 when nothing had to be done, otherwise the result of
 * _PyCode_SetExtra().  Without PY_HAVE_PERF_TRAMPOLINE this is a no-op that
 * returns 0.
 *
 * NOTE(review): a failed compile_trampoline() also returns 0, so callers
 * cannot tell it apart from success -- presumably deliberate best-effort;
 * confirm.
 */
int
PyUnstable_PerfTrampoline_CompileCode(PyCodeObject *co)
{
#ifdef PY_HAVE_PERF_TRAMPOLINE
    assert(extra_code_index != -1);

    py_trampoline existing = NULL;
    int lookup_status = _PyCode_GetExtra((PyObject *)co, extra_code_index,
                                         (void **)&existing);
    if (lookup_status == 0 && existing != NULL) {
        /* A trampoline is already attached to this code object. */
        return 0;
    }

    py_trampoline fresh = compile_trampoline();
    if (fresh == NULL) {
        return 0;
    }
    trampoline_api.write_state(trampoline_api.state, fresh,
                               perf_code_arena->code_size, co);
    return _PyCode_SetExtra((PyObject *)co, extra_code_index, (void *)fresh);
#else
    return 0;
#endif // PY_HAVE_PERF_TRAMPOLINE
}
383+
364384
int
365385
_PyIsPerfTrampolineActive(void)
366386
{
@@ -448,16 +468,34 @@ _PyPerfTrampoline_Fini(void)
448468
return 0;
449469
}
450470

471+
/* Store `enable` in the runtime's persist_after_fork flag and return the
 * value now held by the flag.  When perf trampolines are not compiled in
 * (PY_HAVE_PERF_TRAMPOLINE undefined) nothing is stored and 0 is returned.
 */
int
PyUnstable_PerfTrampoline_SetPersistAfterFork(int enable)
{
#ifdef PY_HAVE_PERF_TRAMPOLINE
    persist_after_fork = enable;
    return persist_after_fork;
#else
    return 0;
#endif
}
479+
451480
PyStatus
452481
_PyPerfTrampoline_AfterFork_Child(void)
453482
{
454483
#ifdef PY_HAVE_PERF_TRAMPOLINE
455-
// Restart trampoline in file in child.
456-
int was_active = _PyIsPerfTrampolineActive();
457-
_PyPerfTrampoline_Fini();
458484
PyUnstable_PerfMapState_Fini();
459-
if (was_active) {
460-
_PyPerfTrampoline_Init(1);
485+
if (persist_after_fork) {
486+
char filename[256];
487+
pid_t parent_pid = getppid();
488+
snprintf(filename, sizeof(filename), "/tmp/perf-%d.map", parent_pid);
489+
if (PyUnstable_CopyPerfMapFile(filename) != 0) {
490+
return PyStatus_Error("Failed to copy perf map file.");
491+
}
492+
} else {
493+
// Restart trampoline in file in child.
494+
int was_active = _PyIsPerfTrampolineActive();
495+
_PyPerfTrampoline_Fini();
496+
if (was_active) {
497+
_PyPerfTrampoline_Init(1);
498+
}
461499
}
462500
#endif
463501
return PyStatus_Ok();

Python/sysmodule.c

Lines changed: 40 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2361,7 +2361,7 @@ PyAPI_FUNC(int) PyUnstable_WritePerfMapEntry(
23612361
#ifndef MS_WINDOWS
23622362
if (perf_map_state.perf_map == NULL) {
23632363
int ret = PyUnstable_PerfMapState_Init();
2364-
if(ret != 0){
2364+
if (ret != 0){
23652365
return ret;
23662366
}
23672367
}
@@ -2388,6 +2388,45 @@ PyAPI_FUNC(void) PyUnstable_PerfMapState_Fini(void) {
23882388
#endif
23892389
}
23902390

2391+
/* Append the contents of the perf-map file `parent_filename` to this
 * process's own perf-map file, initializing the perf-map state first if it
 * has not been opened yet.
 *
 * Returns 0 on success, -1 if the parent file cannot be opened or a
 * read/write/flush error occurs, or the non-zero result of
 * PyUnstable_PerfMapState_Init() if initialization fails.  An empty parent
 * file is a successful (empty) copy.  On Windows this is a no-op that
 * returns 0.
 */
PyAPI_FUNC(int) PyUnstable_CopyPerfMapFile(const char* parent_filename) {
#ifndef MS_WINDOWS
    FILE *from = fopen(parent_filename, "r");
    if (from == NULL) {
        return -1;
    }
    if (perf_map_state.perf_map == NULL) {
        int ret = PyUnstable_PerfMapState_Init();
        if (ret != 0) {
            /* Do not leak the parent file handle on this early exit. */
            fclose(from);
            return ret;
        }
    }

    char buf[4096];
    int result = 0;
    PyThread_acquire_lock(perf_map_state.map_lock, 1);
    for (;;) {
        size_t bytes_read = fread(buf, 1, sizeof(buf), from);
        if (bytes_read == 0) {
            /* A zero-length read is a normal end of file (empty file, or a
               size that is an exact multiple of the buffer) unless the
               stream's error indicator is set. */
            if (ferror(from)) {
                result = -1;
            }
            break;
        }
        size_t bytes_written = fwrite(buf, 1, bytes_read,
                                      perf_map_state.perf_map);
        if (bytes_written < bytes_read) {
            result = -1;
            break;
        }
    }
    /* Push the copied entries out to the map file before releasing the lock. */
    if (fflush(perf_map_state.perf_map) != 0) {
        result = -1;
    }
    fclose(from);
    PyThread_release_lock(perf_map_state.map_lock);
    return result;
#else
    return 0;
#endif
}
2425+
2426+
#ifdef __cplusplus
2427+
}
2428+
#endif
2429+
23912430

23922431
static PyMethodDef sys_methods[] = {
23932432
/* Might as well keep this in alphabetic order */

0 commit comments

Comments
 (0)