diff --git a/Include/cpython/optimizer.h b/Include/cpython/optimizer.h index 47536108a9665e..505e5476895e80 100644 --- a/Include/cpython/optimizer.h +++ b/Include/cpython/optimizer.h @@ -6,9 +6,25 @@ extern "C" { #endif +typedef struct _PyExecutorLinkListNode { + struct _PyExecutorObject *next; + struct _PyExecutorObject *previous; +} _PyExecutorLinkListNode; + +/* Bloom filter with m = 256 */ +#define BLOOM_FILTER_WORDS 8 + +typedef struct _bloom_filter { + uint32_t bits[BLOOM_FILTER_WORDS]; +} _PyBloomFilter; + typedef struct { uint8_t opcode; uint8_t oparg; + uint8_t valid; + uint8_t linked; + _PyBloomFilter bloom; + _PyExecutorLinkListNode links; } _PyVMData; typedef struct _PyExecutorObject { @@ -45,6 +61,11 @@ _PyOptimizer_BackEdge(struct _PyInterpreterFrame *frame, _Py_CODEUNIT *src, _Py_ extern _PyOptimizerObject _PyOptimizer_Default; +void _Py_ExecutorInit(_PyExecutorObject *); +void _Py_ExecutorClear(_PyExecutorObject *); +PyAPI_FUNC(void) _Py_Executor_DependsOn(_PyExecutorObject *executor, void *obj); +PyAPI_FUNC(void) _Py_Executors_InvalidateDependency(PyInterpreterState *interp, void *obj); + /* For testing */ PyAPI_FUNC(PyObject *)PyUnstable_Optimizer_NewCounter(void); PyAPI_FUNC(PyObject *)PyUnstable_Optimizer_NewUOpOptimizer(void); diff --git a/Include/internal/pycore_interp.h b/Include/internal/pycore_interp.h index 21d1ee38b59e6b..774ca546b5b1ac 100644 --- a/Include/internal/pycore_interp.h +++ b/Include/internal/pycore_interp.h @@ -187,6 +187,7 @@ struct _is { struct types_state types; struct callable_cache callable_cache; _PyOptimizerObject *optimizer; + _PyExecutorObject *executor_list_head; uint16_t optimizer_resume_threshold; uint16_t optimizer_backedge_threshold; uint32_t next_func_version; diff --git a/Include/internal/pycore_opcode_metadata.h b/Include/internal/pycore_opcode_metadata.h index 8ef398c5db09f6..09fa1090a5c25f 100644 --- a/Include/internal/pycore_opcode_metadata.h +++ b/Include/internal/pycore_opcode_metadata.h @@ -89,6 +89,7 @@ #define _JUMP_TO_TOP 361 #define _SAVE_CURRENT_IP 362 #define _INSERT 363 +#define _JUMP_IF_INVALID 364 extern int _PyOpcode_num_popped(int opcode, int oparg, bool jump); #ifdef NEED_OPCODE_METADATA @@ -660,6 +661,8 @@ int _PyOpcode_num_popped(int opcode, int oparg, bool jump) { return 0; case _INSERT: return oparg + 1; + case _JUMP_IF_INVALID: + return 0; default: return -1; } @@ -1236,6 +1239,8 @@ int _PyOpcode_num_pushed(int opcode, int oparg, bool jump) { return 0; case _INSERT: return oparg + 1; + case _JUMP_IF_INVALID: + return 0; default: return -1; } @@ -1592,6 +1597,7 @@ const struct opcode_metadata _PyOpcode_opcode_metadata[OPCODE_METADATA_SIZE] = { [_SAVE_CURRENT_IP] = { true, INSTR_FMT_IX, 0 }, [_EXIT_TRACE] = { true, INSTR_FMT_IX, 0 }, [_INSERT] = { true, INSTR_FMT_IB, HAS_ARG_FLAG }, + [_JUMP_IF_INVALID] = { true, INSTR_FMT_IB, HAS_ARG_FLAG }, }; #endif // NEED_OPCODE_METADATA @@ -1814,6 +1820,7 @@ const char * const _PyOpcode_uop_name[OPCODE_UOP_NAME_SIZE] = { [_JUMP_TO_TOP] = "_JUMP_TO_TOP", [_SAVE_CURRENT_IP] = "_SAVE_CURRENT_IP", [_INSERT] = "_INSERT", + [_JUMP_IF_INVALID] = "_JUMP_IF_INVALID", }; #endif // NEED_OPCODE_METADATA diff --git a/Lib/test/test_capi/test_misc.py b/Lib/test/test_capi/test_misc.py index 5ece213e7b2363..cdb251e7a64c10 100644 --- a/Lib/test/test_capi/test_misc.py +++ b/Lib/test/test_capi/test_misc.py @@ -2387,6 +2387,50 @@ def testfunc(x): self.assertEqual(code, replace_code) self.assertEqual(hash(code), hash(replace_code)) +class TestExecutorInvalidation(unittest.TestCase): + + def setUp(self): + self.old = _testinternalcapi.get_optimizer() + self.opt = _testinternalcapi.get_counter_optimizer() + _testinternalcapi.set_optimizer(self.opt) + + def tearDown(self): + _testinternalcapi.set_optimizer(self.old) + + def test_invalidate_object(self): + # Generate a new set of functions at each call + ns = {} + func_src = "\n".join( + f""" + def f{n}(): + for _ in range(1000): + pass + """ for n in range(5) + ) + exec(textwrap.dedent(func_src), ns, ns) + funcs = [ ns[f'f{n}'] for n in range(5)] + objects = [object() for _ in range(5)] + + for f in funcs: + f() + executors = [ get_first_executor(f) for f in funcs] + # Set things up so each executor depends on the objects + # with an equal or lower index. + for i, exe in enumerate(executors): + self.assertTrue(exe.valid) + for obj in objects[:i+1]: + _testinternalcapi.add_executor_dependency(exe, obj) + self.assertTrue(exe.valid) + # Assert that the correct executors are invalidated + # and check that nothing crashes when we invalidate + # an executor mutliple times. + for i in (4,3,2,1,0): + _testinternalcapi.invalidate_executors(objects[i]) + for exe in executors[i:]: + self.assertFalse(exe.valid) + for exe in executors[:i]: + self.assertTrue(exe.valid) + def get_first_executor(func): code = func.__code__ diff --git a/Modules/_testinternalcapi.c b/Modules/_testinternalcapi.c index 05bac0936b155d..a977b4ecbda88a 100644 --- a/Modules/_testinternalcapi.c +++ b/Modules/_testinternalcapi.c @@ -1002,6 +1002,32 @@ get_executor(PyObject *self, PyObject *const *args, Py_ssize_t nargs) return (PyObject *)PyUnstable_GetExecutor((PyCodeObject *)code, ioffset); } +static PyObject * +add_executor_dependency(PyObject *self, PyObject *args) +{ + PyObject *exec; + PyObject *obj; + if (!PyArg_ParseTuple(args, "OO", &exec, &obj)) { + return NULL; + } + /* No way to tell in general if exec is an executor, so we only accept + * counting_executor */ + if (strcmp(Py_TYPE(exec)->tp_name, "counting_executor")) { + PyErr_SetString(PyExc_TypeError, "argument must be a counting_executor"); + return NULL; + } + _Py_Executor_DependsOn((_PyExecutorObject *)exec, obj); + Py_RETURN_NONE; +} + +static PyObject * +invalidate_executors(PyObject *self, PyObject *obj) +{ + PyInterpreterState *interp = PyInterpreterState_Get(); + _Py_Executors_InvalidateDependency(interp, obj); + Py_RETURN_NONE; +} + static int _pending_callback(void *arg) { /* we assume the argument is callable object to which we own a reference */ @@ -1565,6 +1591,8 @@ static PyMethodDef module_functions[] = { {"get_executor", _PyCFunction_CAST(get_executor), METH_FASTCALL, NULL}, {"get_counter_optimizer", get_counter_optimizer, METH_NOARGS, NULL}, {"get_uop_optimizer", get_uop_optimizer, METH_NOARGS, NULL}, + {"add_executor_dependency", add_executor_dependency, METH_VARARGS, NULL}, + {"invalidate_executors", invalidate_executors, METH_O, NULL}, {"pending_threadfunc", _PyCFunction_CAST(pending_threadfunc), METH_VARARGS | METH_KEYWORDS}, {"pending_identify", pending_identify, METH_VARARGS, NULL}, diff --git a/Objects/codeobject.c b/Objects/codeobject.c index f662b8e354bb1e..9e5e84b2d8d2f0 100644 --- a/Objects/codeobject.c +++ b/Objects/codeobject.c @@ -63,6 +63,8 @@ notify_code_watchers(PyCodeEvent event, PyCodeObject *co) } } + + int PyCode_AddWatcher(PyCode_WatchCallback callback) { diff --git a/Python/abstract_interp_cases.c.h b/Python/abstract_interp_cases.c.h index 04fe07fad39937..b7eb0b2c49c16d 100644 --- a/Python/abstract_interp_cases.c.h +++ b/Python/abstract_interp_cases.c.h @@ -932,3 +932,7 @@ PARTITIONNODE_OVERWRITE((_Py_PARTITIONNODE_t *)PARTITIONNODE_NULLROOT, PEEK(-(-1 - oparg)), true); break; } + + case _JUMP_IF_INVALID: { + break; + } diff --git a/Python/bytecodes.c b/Python/bytecodes.c index 6482252eec9fb9..f164a3db806e87 100644 --- a/Python/bytecodes.c +++ b/Python/bytecodes.c @@ -3973,6 +3973,12 @@ dummy_func( memmove(&stack_pointer[-1 - oparg], &stack_pointer[-oparg], oparg * sizeof(stack_pointer[0])); } + op(_JUMP_IF_INVALID, (--)) { + if (!self->base.vm_data.valid) { + pc = oparg; + } + } + // END BYTECODES // diff --git a/Python/executor_cases.c.h b/Python/executor_cases.c.h index c6eeee4925ef1f..d883b34801833a 100644 --- a/Python/executor_cases.c.h +++ b/Python/executor_cases.c.h @@ -3301,3 +3301,10 @@ stack_pointer[-1 - oparg] = top; break; } + + case _JUMP_IF_INVALID: { + if (!self->base.vm_data.valid) { + pc = oparg; + } + break; + } diff --git a/Python/instrumentation.c b/Python/instrumentation.c index eee1908e503e43..c62803c9708aba 100644 --- a/Python/instrumentation.c +++ b/Python/instrumentation.c @@ -1582,6 +1582,7 @@ _Py_Instrument(PyCodeObject *code, PyInterpreterState *interp) if (code->co_executors != NULL) { _PyCode_Clear_Executors(code); } + _Py_Executors_InvalidateDependency(interp, code); int code_len = (int)Py_SIZE(code); /* code->_co_firsttraceable >= code_len indicates * that no instrumentation can be inserted. diff --git a/Python/optimizer.c b/Python/optimizer.c index fbdbf7291784c4..a095d8d341ad12 100644 --- a/Python/optimizer.c +++ b/Python/optimizer.c @@ -220,10 +220,16 @@ typedef struct { static void counter_dealloc(_PyCounterExecutorObject *self) { + _Py_ExecutorClear((_PyExecutorObject *)self); Py_DECREF(self->optimizer); PyObject_Free(self); } +static PyMemberDef counter_members[] = { + { "valid", Py_T_UBYTE, offsetof(_PyCounterExecutorObject, executor.vm_data.valid), Py_READONLY, "is valid?" }, + { NULL } +}; + static PyTypeObject CounterExecutor_Type = { PyVarObject_HEAD_INIT(&PyType_Type, 0) .tp_name = "counting_executor", @@ -231,6 +237,7 @@ static PyTypeObject CounterExecutor_Type = { .tp_itemsize = 0, .tp_flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_DISALLOW_INSTANTIATION, .tp_dealloc = (destructor)counter_dealloc, + .tp_members = counter_members, }; static _PyInterpreterFrame * @@ -261,6 +268,7 @@ counter_optimize( executor->optimizer = (_PyCounterOptimizerObject *)self; executor->next_instr = instr; *exec_ptr = (_PyExecutorObject *)executor; + _Py_ExecutorInit((_PyExecutorObject *)executor); return 1; } @@ -288,6 +296,8 @@ static PyTypeObject CounterOptimizer_Type = { PyObject * PyUnstable_Optimizer_NewCounter(void) { + PyType_Ready(&CounterExecutor_Type); + PyType_Ready(&CounterOptimizer_Type); _PyCounterOptimizerObject *opt = (_PyCounterOptimizerObject *)_PyObject_New(&CounterOptimizer_Type); if (opt == NULL) { return NULL; @@ -303,6 +313,7 @@ PyUnstable_Optimizer_NewCounter(void) static void uop_dealloc(_PyUOpExecutorObject *self) { + _Py_ExecutorClear((_PyExecutorObject *)self); PyObject_Free(self); } @@ -903,6 +914,7 @@ uop_optimize( } executor->base.execute = _PyUopExecute; memcpy(executor->trace, trace, trace_length * sizeof(_PyUOpInstruction)); + _Py_ExecutorInit((_PyExecutorObject *)executor); *exec_ptr = (_PyExecutorObject *)executor; return 1; } @@ -935,3 +947,208 @@ PyUnstable_Optimizer_NewUOpOptimizer(void) opt->backedge_threshold = 16 << OPTIMIZER_BITS_IN_COUNTER; return (PyObject *)opt; } + + +/***************************************** + * Executor management + ****************************************/ + +/* We use a bloomfilter with k = 6, m = 256 + * The choice of k and the following constants + * could do with a more rigourous analysis, + * but here is a simple analysis: + * + * We want to keep the false positive rate low. + * For n = 5 (a trace depends on 5 objects), + * we expect 30 bits set, giving a false positive + * rate of (30/256)**6 == 2.5e-6 which is plenty + * good enough. + * + * However with n = 10 we expect 60 bits set (worst case), + * giving a false positive of (60/256)**6 == 0.0001 + * + * We choose k = 6, rather than a higher number as + * it means the false positive rate goes slower for high n. + * + * n = 15, k = 6 => fp = 0.18% + * n = 15, k = 8 => fp = 0.23% + * n = 20, k = 6 => fp = 1.1% + * n = 20, k = 8 => fp = 2.3% + * + * The above analysis assumes perfect hash functions, + * but we don't have those, so it is approximate. + */ + +#define K 6 + +static uint32_t +address_to_hash(void *ptr, uint32_t seed, uint32_t multiplier) { + assert(ptr != NULL); + uint32_t uhash = seed; + uintptr_t addr = (uintptr_t)ptr; + for (int i = 0; i < SIZEOF_VOID_P; i++) { + uhash ^= addr & 255; + uhash *= multiplier; + addr >>= 8; + } + return uhash; +} + +static const uint32_t multipliers[2] = { + _PyHASH_MULTIPLIER, + 110351524, +}; + +static const uint32_t seeds[2] = { + 20221211, + 2147483647, +}; + +static void +address_to_hashes(void *ptr, _PyBloomFilter *bloom) +{ + for (int i = 0; i < BLOOM_FILTER_WORDS; i++) { + bloom->bits[i] = 0; + } + /* Set k (6) bits */ + for (int i = 0; i < 2; i++) { + uint32_t hash = address_to_hash(ptr, seeds[i], multipliers[i]); + for (int j = 0; j < (K/2); j++) { + uint8_t bits = hash & 255; + bloom->bits[bits >> 5] |= (1 << (bits&31)); + hash >>= 8; + } + } +} + +static void +add_to_bloom_filter(_PyBloomFilter *bloom, PyObject *obj) +{ + _PyBloomFilter hashes; + address_to_hashes(obj, &hashes); + for (int i = 0; i < BLOOM_FILTER_WORDS; i++) { + bloom->bits[i] |= hashes.bits[i]; + } +} + +static bool +bloom_filter_may_contain(_PyBloomFilter *bloom, _PyBloomFilter *hashes) +{ + for (int i = 0; i < BLOOM_FILTER_WORDS; i++) { + if ((bloom->bits[i] & hashes->bits[i]) != hashes->bits[i]) { + return false; + } + } + return true; +} + +static void +link_executor(_PyExecutorObject *executor) +{ + PyInterpreterState *interp = _PyInterpreterState_GET(); + _PyExecutorLinkListNode *links = &executor->vm_data.links; + _PyExecutorObject *head = interp->executor_list_head; + if (head == NULL) { + interp->executor_list_head = executor; + links->previous = NULL; + links->next = NULL; + } + else { + _PyExecutorObject *next = head->vm_data.links.next; + links->previous = head; + links->next = next; + if (next != NULL) { + next->vm_data.links.previous = executor; + } + head->vm_data.links.next = executor; + } + executor->vm_data.linked = true; +} + +static void +unlink_executor(_PyExecutorObject *executor) +{ + if (!executor->vm_data.linked) { + return; + } + _PyExecutorLinkListNode *links = &executor->vm_data.links; + _PyExecutorObject *next = links->next; + _PyExecutorObject *prev = links->previous; + if (next != NULL) { + next->vm_data.links.previous = prev; + } + else if (prev == NULL) { + // Both are NULL, so this must be the list head. + PyInterpreterState *interp = PyInterpreterState_Get(); + assert(interp->executor_list_head == executor); + interp->executor_list_head = executor->vm_data.links.next; + } + if (prev != NULL) { + prev->vm_data.links.next = next; + } + executor->vm_data.linked = false; +} + +/* This must be called by optimizers before using the executor */ +void +_Py_ExecutorInit(_PyExecutorObject *executor) +{ + executor->vm_data.valid = true; + for (int i = 0; i < BLOOM_FILTER_WORDS; i++) { + executor->vm_data.bloom.bits[i] = 0; + } + link_executor(executor); +} + +/* This must be called by executors during dealloc */ +void +_Py_ExecutorClear(_PyExecutorObject *executor) +{ + unlink_executor(executor); +} + +void +_Py_Executor_DependsOn(_PyExecutorObject *executor, void *obj) +{ + assert(executor->vm_data.valid = true); + add_to_bloom_filter(&executor->vm_data.bloom, obj); +} + +/* Invalidate all executors that depend on `obj` + * May cause other executors to be invalidated as well + */ +void +_Py_Executors_InvalidateDependency(PyInterpreterState *interp, void *obj) +{ + _PyBloomFilter hashes; + address_to_hashes(obj, &hashes); + /* Walk the list of executors */ + /* TO DO -- Use a tree to avoid traversing as many objects */ + for (_PyExecutorObject *exec = interp->executor_list_head; exec != NULL;) { + assert(exec->vm_data.valid); + _PyExecutorObject *next = exec->vm_data.links.next; + if (bloom_filter_may_contain(&exec->vm_data.bloom, &hashes)) { + exec->vm_data.valid = false; + unlink_executor(exec); + } + exec = next; + } +} + +/* Invalidate all executors + */ +void +_Py_Executors_InvalidateAll(PyInterpreterState *interp) +{ + /* Walk the list of executors */ + for (_PyExecutorObject *exec = interp->executor_list_head; exec != NULL;) { + assert(exec->vm_data.valid); + _PyExecutorObject *next = exec->vm_data.links.next; + exec->vm_data.links.next = NULL; + exec->vm_data.links.previous = NULL; + exec->vm_data.valid = false; + exec->vm_data.linked = false; + exec = next; + } + interp->executor_list_head = NULL; +} diff --git a/Python/pystate.c b/Python/pystate.c index ae33259f8df2f6..836bf07912c7e5 100644 --- a/Python/pystate.c +++ b/Python/pystate.c @@ -704,6 +704,7 @@ init_interpreter(PyInterpreterState *interp, interp->optimizer_backedge_threshold = _PyOptimizer_Default.backedge_threshold; interp->optimizer_resume_threshold = _PyOptimizer_Default.backedge_threshold; interp->next_func_version = 1; + interp->executor_list_head = NULL; if (interp != &runtime->_main_interpreter) { /* Fix the self-referential, statically initialized fields. */ interp->dtoa = (struct _dtoa_state)_dtoa_state_INIT(interp);