From 1e6df25b5f4fa34a27123227fc07359c42f3c230 Mon Sep 17 00:00:00 2001 From: Brandt Bucher Date: Sat, 13 Jul 2024 00:00:32 -0700 Subject: [PATCH 01/11] Don't require progress on side exits --- Include/internal/pycore_optimizer.h | 4 +-- Python/bytecodes.c | 6 ++-- Python/executor_cases.c.h | 4 +-- Python/generated_cases.c.h | 2 +- Python/optimizer.c | 50 ++++++++++++++++------------- 5 files changed, 35 insertions(+), 31 deletions(-) diff --git a/Include/internal/pycore_optimizer.h b/Include/internal/pycore_optimizer.h index b6da27c067727f..a30f6ef9a86666 100644 --- a/Include/internal/pycore_optimizer.h +++ b/Include/internal/pycore_optimizer.h @@ -83,7 +83,7 @@ typedef struct _PyOptimizerObject _PyOptimizerObject; typedef int (*_Py_optimize_func)( _PyOptimizerObject* self, struct _PyInterpreterFrame *frame, _Py_CODEUNIT *instr, _PyExecutorObject **exec_ptr, - int curr_stackentries); + int curr_stackentries, bool progress_needed); struct _PyOptimizerObject { PyObject_HEAD @@ -257,7 +257,7 @@ extern int _Py_uop_frame_pop(_Py_UOpsContext *ctx); PyAPI_FUNC(PyObject *) _Py_uop_symbols_test(PyObject *self, PyObject *ignored); -PyAPI_FUNC(int) _PyOptimizer_Optimize(struct _PyInterpreterFrame *frame, _Py_CODEUNIT *start, _PyStackRef *stack_pointer, _PyExecutorObject **exec_ptr); +PyAPI_FUNC(int) _PyOptimizer_Optimize(struct _PyInterpreterFrame *frame, _Py_CODEUNIT *start, _PyStackRef *stack_pointer, _PyExecutorObject **exec_ptr, bool progress_needed); static inline int is_terminator(const _PyUOpInstruction *uop) { diff --git a/Python/bytecodes.c b/Python/bytecodes.c index 9a1af0e920188b..40a6797819a8f6 100644 --- a/Python/bytecodes.c +++ b/Python/bytecodes.c @@ -2508,7 +2508,7 @@ dummy_func( start--; } _PyExecutorObject *executor; - int optimized = _PyOptimizer_Optimize(frame, start, stack_pointer, &executor); + int optimized = _PyOptimizer_Optimize(frame, start, stack_pointer, &executor, true); ERROR_IF(optimized < 0, error); if (optimized) { 
assert(tstate->previous_executor == NULL); @@ -4547,7 +4547,7 @@ dummy_func( Py_INCREF(executor); } else { - int optimized = _PyOptimizer_Optimize(frame, target, stack_pointer, &executor); + int optimized = _PyOptimizer_Optimize(frame, target, stack_pointer, &executor, false); if (optimized <= 0) { exit->temperature = restart_backoff_counter(temperature); if (optimized < 0) { @@ -4630,7 +4630,7 @@ dummy_func( exit->temperature = advance_backoff_counter(exit->temperature); GOTO_TIER_ONE(target); } - int optimized = _PyOptimizer_Optimize(frame, target, stack_pointer, &executor); + int optimized = _PyOptimizer_Optimize(frame, target, stack_pointer, &executor, false); if (optimized <= 0) { exit->temperature = restart_backoff_counter(exit->temperature); if (optimized < 0) { diff --git a/Python/executor_cases.c.h b/Python/executor_cases.c.h index afc7786c9e434d..e593c837af4791 100644 --- a/Python/executor_cases.c.h +++ b/Python/executor_cases.c.h @@ -5023,7 +5023,7 @@ Py_INCREF(executor); } else { - int optimized = _PyOptimizer_Optimize(frame, target, stack_pointer, &executor); + int optimized = _PyOptimizer_Optimize(frame, target, stack_pointer, &executor, false); if (optimized <= 0) { exit->temperature = restart_backoff_counter(temperature); if (optimized < 0) { @@ -5155,7 +5155,7 @@ exit->temperature = advance_backoff_counter(exit->temperature); GOTO_TIER_ONE(target); } - int optimized = _PyOptimizer_Optimize(frame, target, stack_pointer, &executor); + int optimized = _PyOptimizer_Optimize(frame, target, stack_pointer, &executor, false); if (optimized <= 0) { exit->temperature = restart_backoff_counter(exit->temperature); if (optimized < 0) { diff --git a/Python/generated_cases.c.h b/Python/generated_cases.c.h index f670353cdbde56..73989de21d85a7 100644 --- a/Python/generated_cases.c.h +++ b/Python/generated_cases.c.h @@ -4272,7 +4272,7 @@ start--; } _PyExecutorObject *executor; - int optimized = _PyOptimizer_Optimize(frame, start, stack_pointer, &executor); + int 
optimized = _PyOptimizer_Optimize(frame, start, stack_pointer, &executor, true); if (optimized < 0) goto error; if (optimized) { assert(tstate->previous_executor == NULL); diff --git a/Python/optimizer.c b/Python/optimizer.c index e9cbfc5497189c..1a3c7bb4f1f995 100644 --- a/Python/optimizer.c +++ b/Python/optimizer.c @@ -111,7 +111,7 @@ never_optimize( _PyInterpreterFrame *frame, _Py_CODEUNIT *instr, _PyExecutorObject **exec, - int Py_UNUSED(stack_entries)) + int Py_UNUSED(stack_entries), bool Py_UNUSED(progress_needed)) { // This may be called if the optimizer is reset return 0; @@ -176,32 +176,37 @@ _Py_SetTier2Optimizer(_PyOptimizerObject *optimizer) int _PyOptimizer_Optimize( _PyInterpreterFrame *frame, _Py_CODEUNIT *start, - _PyStackRef *stack_pointer, _PyExecutorObject **executor_ptr) + _PyStackRef *stack_pointer, _PyExecutorObject **executor_ptr, bool progress_needed) { PyCodeObject *code = _PyFrame_GetCode(frame); assert(PyCode_Check(code)); PyInterpreterState *interp = _PyInterpreterState_GET(); - if (!has_space_for_executor(code, start)) { + if (progress_needed && !has_space_for_executor(code, start)) { return 0; } _PyOptimizerObject *opt = interp->optimizer; - int err = opt->optimize(opt, frame, start, executor_ptr, (int)(stack_pointer - _PyFrame_Stackbase(frame))); + int err = opt->optimize(opt, frame, start, executor_ptr, (int)(stack_pointer - _PyFrame_Stackbase(frame)), progress_needed); if (err <= 0) { return err; } assert(*executor_ptr != NULL); - int index = get_index_for_executor(code, start); - if (index < 0) { - /* Out of memory. Don't raise and assume that the - * error will show up elsewhere. - * - * If an optimizer has already produced an executor, - * it might get confused by the executor disappearing, - * but there is not much we can do about that here. */ - Py_DECREF(*executor_ptr); - return 0; + if (progress_needed) { + int index = get_index_for_executor(code, start); + if (index < 0) { + /* Out of memory. 
Don't raise and assume that the + * error will show up elsewhere. + * + * If an optimizer has already produced an executor, + * it might get confused by the executor disappearing, + * but there is not much we can do about that here. */ + Py_DECREF(*executor_ptr); + return 0; + } + insert_executor(code, start, index, *executor_ptr); + } + else { + (*executor_ptr)->vm_data.code = NULL; } - insert_executor(code, start, index, *executor_ptr); assert((*executor_ptr)->vm_data.valid); return 1; } @@ -530,9 +535,8 @@ translate_bytecode_to_trace( _Py_CODEUNIT *instr, _PyUOpInstruction *trace, int buffer_size, - _PyBloomFilter *dependencies) + _PyBloomFilter *dependencies, bool progress_needed) { - bool progress_needed = true; PyCodeObject *code = _PyFrame_GetCode(frame); PyFunctionObject *func = (PyFunctionObject *)frame->f_funcobj; assert(PyFunction_Check(func)); @@ -567,6 +571,7 @@ translate_bytecode_to_trace( 2 * INSTR_IP(initial_instr, code)); ADD_TO_TRACE(_START_EXECUTOR, 0, (uintptr_t)instr, INSTR_IP(instr, code)); uint32_t target = 0; + bool first = true; for (;;) { target = INSTR_IP(instr, code); @@ -576,7 +581,7 @@ translate_bytecode_to_trace( uint32_t opcode = instr->op.code; uint32_t oparg = instr->op.arg; - if (!progress_needed && instr == initial_instr) { + if (!first && instr == initial_instr) { // We have looped around to the start: RESERVE(1); ADD_TO_TRACE(_JUMP_TO_TOP, 0, 0, 0); @@ -606,8 +611,6 @@ translate_bytecode_to_trace( RESERVE_RAW(2, "_CHECK_VALIDITY_AND_SET_IP"); ADD_TO_TRACE(_CHECK_VALIDITY_AND_SET_IP, 0, (uintptr_t)instr, target); - /* Special case the first instruction, - * so that we can guarantee forward progress */ if (progress_needed) { if (OPCODE_HAS_EXIT(opcode) || OPCODE_HAS_DEOPT(opcode)) { opcode = _PyOpcode_Deopt[opcode]; @@ -904,6 +907,7 @@ translate_bytecode_to_trace( top: // Jump here after _PUSH_FRAME or likely branches. 
progress_needed = false; + first = false; } // End for (;;) done: @@ -1225,13 +1229,13 @@ uop_optimize( _PyInterpreterFrame *frame, _Py_CODEUNIT *instr, _PyExecutorObject **exec_ptr, - int curr_stackentries) + int curr_stackentries, bool progress_needed) { _PyBloomFilter dependencies; _Py_BloomFilter_Init(&dependencies); _PyUOpInstruction buffer[UOP_MAX_TRACE_LENGTH]; OPT_STAT_INC(attempts); - int length = translate_bytecode_to_trace(frame, instr, buffer, UOP_MAX_TRACE_LENGTH, &dependencies); + int length = translate_bytecode_to_trace(frame, instr, buffer, UOP_MAX_TRACE_LENGTH, &dependencies, progress_needed); if (length <= 0) { // Error or nothing translated return length; @@ -1328,7 +1332,7 @@ counter_optimize( _PyInterpreterFrame *frame, _Py_CODEUNIT *instr, _PyExecutorObject **exec_ptr, - int Py_UNUSED(curr_stackentries) + int Py_UNUSED(curr_stackentries), bool Py_UNUSED(progress_needed) ) { PyCodeObject *code = _PyFrame_GetCode(frame); From c88b70db1ef5406b2324e4d82d0dd61f4c21c68d Mon Sep 17 00:00:00 2001 From: Brandt Bucher Date: Sat, 13 Jul 2024 10:31:36 -0700 Subject: [PATCH 02/11] Add an upper bound on side exit chaining before progress is required again --- Include/internal/pycore_optimizer.h | 1 + Python/bytecodes.c | 8 ++++++-- Python/executor_cases.c.h | 7 +++++-- Python/generated_cases.c.h | 1 + 4 files changed, 13 insertions(+), 4 deletions(-) diff --git a/Include/internal/pycore_optimizer.h b/Include/internal/pycore_optimizer.h index a30f6ef9a86666..c513fd6cba401a 100644 --- a/Include/internal/pycore_optimizer.h +++ b/Include/internal/pycore_optimizer.h @@ -35,6 +35,7 @@ typedef struct { _PyBloomFilter bloom; _PyExecutorLinkListNode links; PyCodeObject *code; // Weak (NULL if no corresponding ENTER_EXECUTOR). 
+ int depth; } _PyVMData; /* Depending on the format, diff --git a/Python/bytecodes.c b/Python/bytecodes.c index 40a6797819a8f6..474ed1041dbc8f 100644 --- a/Python/bytecodes.c +++ b/Python/bytecodes.c @@ -2511,6 +2511,7 @@ dummy_func( int optimized = _PyOptimizer_Optimize(frame, start, stack_pointer, &executor, true); ERROR_IF(optimized < 0, error); if (optimized) { + executor->vm_data.depth = 0; assert(tstate->previous_executor == NULL); tstate->previous_executor = Py_None; GOTO_TIER_TWO(executor); @@ -4547,7 +4548,8 @@ dummy_func( Py_INCREF(executor); } else { - int optimized = _PyOptimizer_Optimize(frame, target, stack_pointer, &executor, false); + int new_depth = (current_executor->vm_data.depth + 1) % 4; + int optimized = _PyOptimizer_Optimize(frame, target, stack_pointer, &executor, new_depth == 0); if (optimized <= 0) { exit->temperature = restart_backoff_counter(temperature); if (optimized < 0) { @@ -4558,6 +4560,7 @@ dummy_func( tstate->previous_executor = (PyObject *)current_executor; GOTO_TIER_ONE(target); } + executor->vm_data.depth = new_depth; } exit->executor = executor; } @@ -4630,7 +4633,7 @@ dummy_func( exit->temperature = advance_backoff_counter(exit->temperature); GOTO_TIER_ONE(target); } - int optimized = _PyOptimizer_Optimize(frame, target, stack_pointer, &executor, false); + int optimized = _PyOptimizer_Optimize(frame, target, stack_pointer, &executor, true); if (optimized <= 0) { exit->temperature = restart_backoff_counter(exit->temperature); if (optimized < 0) { @@ -4641,6 +4644,7 @@ dummy_func( GOTO_TIER_ONE(target); } else { + executor->vm_data.depth = 0; exit->temperature = initial_temperature_backoff_counter(); } } diff --git a/Python/executor_cases.c.h b/Python/executor_cases.c.h index e593c837af4791..4cff908e3b2bdc 100644 --- a/Python/executor_cases.c.h +++ b/Python/executor_cases.c.h @@ -5023,7 +5023,8 @@ Py_INCREF(executor); } else { - int optimized = _PyOptimizer_Optimize(frame, target, stack_pointer, &executor, false); + int 
new_depth = (current_executor->vm_data.depth + 1) % 4; + int optimized = _PyOptimizer_Optimize(frame, target, stack_pointer, &executor, new_depth == 0); if (optimized <= 0) { exit->temperature = restart_backoff_counter(temperature); if (optimized < 0) { @@ -5034,6 +5035,7 @@ tstate->previous_executor = (PyObject *)current_executor; GOTO_TIER_ONE(target); } + executor->vm_data.depth = new_depth; } exit->executor = executor; } @@ -5155,7 +5157,7 @@ exit->temperature = advance_backoff_counter(exit->temperature); GOTO_TIER_ONE(target); } - int optimized = _PyOptimizer_Optimize(frame, target, stack_pointer, &executor, false); + int optimized = _PyOptimizer_Optimize(frame, target, stack_pointer, &executor, true); if (optimized <= 0) { exit->temperature = restart_backoff_counter(exit->temperature); if (optimized < 0) { @@ -5166,6 +5168,7 @@ GOTO_TIER_ONE(target); } else { + executor->vm_data.depth = 0; exit->temperature = initial_temperature_backoff_counter(); } } diff --git a/Python/generated_cases.c.h b/Python/generated_cases.c.h index 73989de21d85a7..d7be223dfd09f4 100644 --- a/Python/generated_cases.c.h +++ b/Python/generated_cases.c.h @@ -4275,6 +4275,7 @@ int optimized = _PyOptimizer_Optimize(frame, start, stack_pointer, &executor, true); if (optimized < 0) goto error; if (optimized) { + executor->vm_data.depth = 0; assert(tstate->previous_executor == NULL); tstate->previous_executor = Py_None; GOTO_TIER_TWO(executor); From 5165880bca88683bd5a29e22e23e13b24aedb5a5 Mon Sep 17 00:00:00 2001 From: Brandt Bucher Date: Thu, 18 Jul 2024 16:35:57 -0700 Subject: [PATCH 03/11] Clean things up --- Include/internal/pycore_optimizer.h | 10 ++++++++-- Python/bytecodes.c | 11 ++++------- Python/executor_cases.c.h | 8 +++----- Python/generated_cases.c.h | 3 +-- Python/optimizer.c | 9 ++++++++- 5 files changed, 24 insertions(+), 17 deletions(-) diff --git a/Include/internal/pycore_optimizer.h b/Include/internal/pycore_optimizer.h index c513fd6cba401a..e3b8333a624107 100644 --- 
a/Include/internal/pycore_optimizer.h +++ b/Include/internal/pycore_optimizer.h @@ -35,7 +35,7 @@ typedef struct { _PyBloomFilter bloom; _PyExecutorLinkListNode links; PyCodeObject *code; // Weak (NULL if no corresponding ENTER_EXECUTOR). - int depth; + int chain_depth; } _PyVMData; /* Depending on the format, @@ -183,6 +183,12 @@ static inline uint16_t uop_get_error_target(const _PyUOpInstruction *inst) // Need extras for root frame and for overflow frame (see TRACE_STACK_PUSH()) #define MAX_ABSTRACT_FRAME_DEPTH (TRACE_STACK_SIZE + 2) +// The number of traces that will be stitched together via side exits for a +// single instruction before requiring progress. In practice, this is the number +// of different classes/functions/etc. that tier two can handle for a single +// tier one instruction. +#define MAX_CHAIN_DEPTH 4 + typedef struct _Py_UopsSymbol _Py_UopsSymbol; struct _Py_UOpsAbstractFrame { @@ -258,7 +264,7 @@ extern int _Py_uop_frame_pop(_Py_UOpsContext *ctx); PyAPI_FUNC(PyObject *) _Py_uop_symbols_test(PyObject *self, PyObject *ignored); -PyAPI_FUNC(int) _PyOptimizer_Optimize(struct _PyInterpreterFrame *frame, _Py_CODEUNIT *start, _PyStackRef *stack_pointer, _PyExecutorObject **exec_ptr, bool progress_needed); +PyAPI_FUNC(int) _PyOptimizer_Optimize(struct _PyInterpreterFrame *frame, _Py_CODEUNIT *start, _PyStackRef *stack_pointer, _PyExecutorObject **exec_ptr, int chain_depth); static inline int is_terminator(const _PyUOpInstruction *uop) { diff --git a/Python/bytecodes.c b/Python/bytecodes.c index 474ed1041dbc8f..0872c31eefde13 100644 --- a/Python/bytecodes.c +++ b/Python/bytecodes.c @@ -2508,10 +2508,9 @@ dummy_func( start--; } _PyExecutorObject *executor; - int optimized = _PyOptimizer_Optimize(frame, start, stack_pointer, &executor, true); + int optimized = _PyOptimizer_Optimize(frame, start, stack_pointer, &executor, 0); ERROR_IF(optimized < 0, error); if (optimized) { - executor->vm_data.depth = 0; assert(tstate->previous_executor == NULL); 
tstate->previous_executor = Py_None; GOTO_TIER_TWO(executor); @@ -4548,8 +4547,8 @@ dummy_func( Py_INCREF(executor); } else { - int new_depth = (current_executor->vm_data.depth + 1) % 4; - int optimized = _PyOptimizer_Optimize(frame, target, stack_pointer, &executor, new_depth == 0); + int chain_depth = current_executor->vm_data.chain_depth + 1; + int optimized = _PyOptimizer_Optimize(frame, target, stack_pointer, &executor, chain_depth); if (optimized <= 0) { exit->temperature = restart_backoff_counter(temperature); if (optimized < 0) { @@ -4560,7 +4559,6 @@ dummy_func( tstate->previous_executor = (PyObject *)current_executor; GOTO_TIER_ONE(target); } - executor->vm_data.depth = new_depth; } exit->executor = executor; } @@ -4633,7 +4631,7 @@ dummy_func( exit->temperature = advance_backoff_counter(exit->temperature); GOTO_TIER_ONE(target); } - int optimized = _PyOptimizer_Optimize(frame, target, stack_pointer, &executor, true); + int optimized = _PyOptimizer_Optimize(frame, target, stack_pointer, &executor, 0); if (optimized <= 0) { exit->temperature = restart_backoff_counter(exit->temperature); if (optimized < 0) { @@ -4644,7 +4642,6 @@ dummy_func( GOTO_TIER_ONE(target); } else { - executor->vm_data.depth = 0; exit->temperature = initial_temperature_backoff_counter(); } } diff --git a/Python/executor_cases.c.h b/Python/executor_cases.c.h index 4cff908e3b2bdc..c81b334f28e67f 100644 --- a/Python/executor_cases.c.h +++ b/Python/executor_cases.c.h @@ -5023,8 +5023,8 @@ Py_INCREF(executor); } else { - int new_depth = (current_executor->vm_data.depth + 1) % 4; - int optimized = _PyOptimizer_Optimize(frame, target, stack_pointer, &executor, new_depth == 0); + int chain_depth = current_executor->vm_data.chain_depth + 1; + int optimized = _PyOptimizer_Optimize(frame, target, stack_pointer, &executor, chain_depth); if (optimized <= 0) { exit->temperature = restart_backoff_counter(temperature); if (optimized < 0) { @@ -5035,7 +5035,6 @@ tstate->previous_executor = (PyObject 
*)current_executor; GOTO_TIER_ONE(target); } - executor->vm_data.depth = new_depth; } exit->executor = executor; } @@ -5157,7 +5156,7 @@ exit->temperature = advance_backoff_counter(exit->temperature); GOTO_TIER_ONE(target); } - int optimized = _PyOptimizer_Optimize(frame, target, stack_pointer, &executor, true); + int optimized = _PyOptimizer_Optimize(frame, target, stack_pointer, &executor, 0); if (optimized <= 0) { exit->temperature = restart_backoff_counter(exit->temperature); if (optimized < 0) { @@ -5168,7 +5167,6 @@ GOTO_TIER_ONE(target); } else { - executor->vm_data.depth = 0; exit->temperature = initial_temperature_backoff_counter(); } } diff --git a/Python/generated_cases.c.h b/Python/generated_cases.c.h index d7be223dfd09f4..b687380d67a318 100644 --- a/Python/generated_cases.c.h +++ b/Python/generated_cases.c.h @@ -4272,10 +4272,9 @@ start--; } _PyExecutorObject *executor; - int optimized = _PyOptimizer_Optimize(frame, start, stack_pointer, &executor, true); + int optimized = _PyOptimizer_Optimize(frame, start, stack_pointer, &executor, 0); if (optimized < 0) goto error; if (optimized) { - executor->vm_data.depth = 0; assert(tstate->previous_executor == NULL); tstate->previous_executor = Py_None; GOTO_TIER_TWO(executor); diff --git a/Python/optimizer.c b/Python/optimizer.c index 1a3c7bb4f1f995..36581a2cfeb01b 100644 --- a/Python/optimizer.c +++ b/Python/optimizer.c @@ -176,8 +176,14 @@ _Py_SetTier2Optimizer(_PyOptimizerObject *optimizer) int _PyOptimizer_Optimize( _PyInterpreterFrame *frame, _Py_CODEUNIT *start, - _PyStackRef *stack_pointer, _PyExecutorObject **executor_ptr, bool progress_needed) + _PyStackRef *stack_pointer, _PyExecutorObject **executor_ptr, int chain_depth) { + // The first instruction in a chain and the MAX_CHAIN_DEPTH'th instruction + // *must* make progress in order to avoid infinite loops or excessively-long + // side-exit chains. 
We can only insert the executor into the bytecode if + // this is true, since a deopt won't infinitely re-enter the executor: + chain_depth %= MAX_CHAIN_DEPTH; + bool progress_needed = chain_depth == 0; PyCodeObject *code = _PyFrame_GetCode(frame); assert(PyCode_Check(code)); PyInterpreterState *interp = _PyInterpreterState_GET(); @@ -207,6 +213,7 @@ _PyOptimizer_Optimize( else { (*executor_ptr)->vm_data.code = NULL; } + (*executor_ptr)->vm_data.chain_depth = chain_depth; assert((*executor_ptr)->vm_data.valid); return 1; } From 63b52076b9636908a2cc4543395a3917a3263989 Mon Sep 17 00:00:00 2001 From: Brandt Bucher Date: Mon, 29 Jul 2024 21:06:31 -0700 Subject: [PATCH 04/11] We still need to translate an instruction, even if progress isn't needed --- Python/optimizer.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Python/optimizer.c b/Python/optimizer.c index 36581a2cfeb01b..bcf76ec457f63b 100644 --- a/Python/optimizer.c +++ b/Python/optimizer.c @@ -618,7 +618,8 @@ translate_bytecode_to_trace( RESERVE_RAW(2, "_CHECK_VALIDITY_AND_SET_IP"); ADD_TO_TRACE(_CHECK_VALIDITY_AND_SET_IP, 0, (uintptr_t)instr, target); - if (progress_needed) { + if (first && progress_needed) { + assert(first); if (OPCODE_HAS_EXIT(opcode) || OPCODE_HAS_DEOPT(opcode)) { opcode = _PyOpcode_Deopt[opcode]; } @@ -913,7 +914,6 @@ translate_bytecode_to_trace( } top: // Jump here after _PUSH_FRAME or likely branches. 
- progress_needed = false; first = false; } // End for (;;) @@ -923,7 +923,7 @@ translate_bytecode_to_trace( } assert(code == initial_code); // Skip short traces where we can't even translate a single instruction: - if (progress_needed) { + if (first) { OPT_STAT_INC(trace_too_short); DPRINTF(2, "No trace for %s (%s:%d) at byte offset %d (no progress)\n", From 17a27d065f53ff07912fb4456aa99d53f76c5889 Mon Sep 17 00:00:00 2001 From: Brandt Bucher Date: Thu, 8 Aug 2024 00:41:25 -0700 Subject: [PATCH 05/11] Stitch to ENTER_EXECUTOR during projection --- Python/optimizer.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/Python/optimizer.c b/Python/optimizer.c index bcf76ec457f63b..f723fe9e8567ba 100644 --- a/Python/optimizer.c +++ b/Python/optimizer.c @@ -597,14 +597,6 @@ translate_bytecode_to_trace( DPRINTF(2, "%d: %s(%d)\n", target, _PyOpcode_OpName[opcode], oparg); - if (opcode == ENTER_EXECUTOR) { - assert(oparg < 256); - _PyExecutorObject *executor = code->co_executors->executors[oparg]; - opcode = executor->vm_data.opcode; - DPRINTF(2, " * ENTER_EXECUTOR -> %s\n", _PyOpcode_OpName[opcode]); - oparg = executor->vm_data.oparg; - } - if (opcode == EXTENDED_ARG) { instr++; opcode = instr->op.code; @@ -614,6 +606,10 @@ translate_bytecode_to_trace( goto done; } } + + if (opcode == ENTER_EXECUTOR) { + goto done; + } assert(opcode != ENTER_EXECUTOR && opcode != EXTENDED_ARG); RESERVE_RAW(2, "_CHECK_VALIDITY_AND_SET_IP"); ADD_TO_TRACE(_CHECK_VALIDITY_AND_SET_IP, 0, (uintptr_t)instr, target); From 133aa69f6dc8433b9387f7b426ae8924e36f7afd Mon Sep 17 00:00:00 2001 From: Brandt Bucher Date: Thu, 8 Aug 2024 16:00:25 -0700 Subject: [PATCH 06/11] Clean up the diff --- Include/internal/pycore_optimizer.h | 8 ++++---- Python/optimizer.c | 13 +++++++------ 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/Include/internal/pycore_optimizer.h b/Include/internal/pycore_optimizer.h index e3b8333a624107..c0984f9a1977b3 100644 --- 
a/Include/internal/pycore_optimizer.h +++ b/Include/internal/pycore_optimizer.h @@ -183,10 +183,10 @@ static inline uint16_t uop_get_error_target(const _PyUOpInstruction *inst) // Need extras for root frame and for overflow frame (see TRACE_STACK_PUSH()) #define MAX_ABSTRACT_FRAME_DEPTH (TRACE_STACK_SIZE + 2) -// The number of traces that will be stitched together via side exits for a -// single instruction before requiring progress. In practice, this is the number -// of different classes/functions/etc. that tier two can handle for a single -// tier one instruction. +// The maximum number of side exits that we can take before requiring forward +// progress (and inserting a new ENTER_EXECUTOR instruction). In practice, this +// is the "maximum amount of polymorphism" that an isolated trace tree can +// handle before rejoining the rest of the program. #define MAX_CHAIN_DEPTH 4 typedef struct _Py_UopsSymbol _Py_UopsSymbol; diff --git a/Python/optimizer.c b/Python/optimizer.c index f723fe9e8567ba..0375e5d65ad414 100644 --- a/Python/optimizer.c +++ b/Python/optimizer.c @@ -200,11 +200,11 @@ _PyOptimizer_Optimize( int index = get_index_for_executor(code, start); if (index < 0) { /* Out of memory. Don't raise and assume that the - * error will show up elsewhere. - * - * If an optimizer has already produced an executor, - * it might get confused by the executor disappearing, - * but there is not much we can do about that here. */ + * error will show up elsewhere. + * + * If an optimizer has already produced an executor, + * it might get confused by the executor disappearing, + * but there is not much we can do about that here. 
*/ Py_DECREF(*executor_ptr); return 0; } @@ -588,6 +588,8 @@ translate_bytecode_to_trace( uint32_t opcode = instr->op.code; uint32_t oparg = instr->op.arg; + /* Special case the first instruction, + * so that we can guarantee forward progress */ if (!first && instr == initial_instr) { // We have looped around to the start: RESERVE(1); @@ -606,7 +608,6 @@ translate_bytecode_to_trace( goto done; } } - if (opcode == ENTER_EXECUTOR) { goto done; } From a5bdc9fac9de2c99f3d5c5a5af0bfe94c8ad2b91 Mon Sep 17 00:00:00 2001 From: Brandt Bucher Date: Thu, 8 Aug 2024 16:02:37 -0700 Subject: [PATCH 07/11] blurb add --- .../2024-08-08-16-02-28.gh-issue-118093.m6Mrvy.rst | 1 + 1 file changed, 1 insertion(+) create mode 100644 Misc/NEWS.d/next/Core_and_Builtins/2024-08-08-16-02-28.gh-issue-118093.m6Mrvy.rst diff --git a/Misc/NEWS.d/next/Core_and_Builtins/2024-08-08-16-02-28.gh-issue-118093.m6Mrvy.rst b/Misc/NEWS.d/next/Core_and_Builtins/2024-08-08-16-02-28.gh-issue-118093.m6Mrvy.rst new file mode 100644 index 00000000000000..dacc7275d3d6dd --- /dev/null +++ b/Misc/NEWS.d/next/Core_and_Builtins/2024-08-08-16-02-28.gh-issue-118093.m6Mrvy.rst @@ -0,0 +1 @@ +Improve the experimental JIT's handling of polymorphic code. 
From 94c930856a4790af96408bd169d61c18b60d3cb7 Mon Sep 17 00:00:00 2001 From: Brandt Bucher Date: Thu, 8 Aug 2024 16:04:51 -0700 Subject: [PATCH 08/11] fixup --- Python/optimizer.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Python/optimizer.c b/Python/optimizer.c index 0375e5d65ad414..e40c375026eacd 100644 --- a/Python/optimizer.c +++ b/Python/optimizer.c @@ -588,8 +588,6 @@ translate_bytecode_to_trace( uint32_t opcode = instr->op.code; uint32_t oparg = instr->op.arg; - /* Special case the first instruction, - * so that we can guarantee forward progress */ if (!first && instr == initial_instr) { // We have looped around to the start: RESERVE(1); @@ -615,6 +613,8 @@ translate_bytecode_to_trace( RESERVE_RAW(2, "_CHECK_VALIDITY_AND_SET_IP"); ADD_TO_TRACE(_CHECK_VALIDITY_AND_SET_IP, 0, (uintptr_t)instr, target); + /* Special case the first instruction, + * so that we can guarantee forward progress */ if (first && progress_needed) { assert(first); if (OPCODE_HAS_EXIT(opcode) || OPCODE_HAS_DEOPT(opcode)) { From 037b3fe8ee4770099e369d3dd699947e9b3567af Mon Sep 17 00:00:00 2001 From: Brandt Bucher Date: Thu, 8 Aug 2024 16:21:44 -0700 Subject: [PATCH 09/11] Clean up the diff --- Python/optimizer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Python/optimizer.c b/Python/optimizer.c index e40c375026eacd..766b517ba9ca5f 100644 --- a/Python/optimizer.c +++ b/Python/optimizer.c @@ -544,6 +544,7 @@ translate_bytecode_to_trace( int buffer_size, _PyBloomFilter *dependencies, bool progress_needed) { + bool first = true; PyCodeObject *code = _PyFrame_GetCode(frame); PyFunctionObject *func = (PyFunctionObject *)frame->f_funcobj; assert(PyFunction_Check(func)); @@ -578,7 +579,6 @@ translate_bytecode_to_trace( 2 * INSTR_IP(initial_instr, code)); ADD_TO_TRACE(_START_EXECUTOR, 0, (uintptr_t)instr, INSTR_IP(instr, code)); uint32_t target = 0; - bool first = true; for (;;) { target = INSTR_IP(instr, code); From 
45c34a3a1abcd114f955adcf53a47fed7808eac2 Mon Sep 17 00:00:00 2001 From: Brandt Bucher Date: Mon, 12 Aug 2024 11:12:03 -0700 Subject: [PATCH 10/11] Pack some small _PyVMData members into a bitfield --- Include/internal/pycore_optimizer.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Include/internal/pycore_optimizer.h b/Include/internal/pycore_optimizer.h index c0984f9a1977b3..19e54bf122a8bb 100644 --- a/Include/internal/pycore_optimizer.h +++ b/Include/internal/pycore_optimizer.h @@ -29,13 +29,13 @@ typedef struct { typedef struct { uint8_t opcode; uint8_t oparg; - uint8_t valid; - uint8_t linked; + uint16_t valid:1; + uint16_t linked:1; + uint16_t chain_depth:14; // Must be big enough for MAX_CHAIN_DEPTH - 1. int index; // Index of ENTER_EXECUTOR (if code isn't NULL, below). _PyBloomFilter bloom; _PyExecutorLinkListNode links; PyCodeObject *code; // Weak (NULL if no corresponding ENTER_EXECUTOR). - int chain_depth; } _PyVMData; /* Depending on the format, From 167df2996d91359795cf7ed1b6eb96d9f1567e3f Mon Sep 17 00:00:00 2001 From: Brandt Bucher Date: Mon, 12 Aug 2024 11:12:25 -0700 Subject: [PATCH 11/11] Add clarifying comments and fix style --- Python/optimizer.c | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/Python/optimizer.c b/Python/optimizer.c index 766b517ba9ca5f..1758329c832a88 100644 --- a/Python/optimizer.c +++ b/Python/optimizer.c @@ -111,7 +111,8 @@ never_optimize( _PyInterpreterFrame *frame, _Py_CODEUNIT *instr, _PyExecutorObject **exec, - int Py_UNUSED(stack_entries), bool Py_UNUSED(progress_needed)) + int Py_UNUSED(stack_entries), + bool Py_UNUSED(progress_needed)) { // This may be called if the optimizer is reset return 0; @@ -178,8 +179,8 @@ _PyOptimizer_Optimize( _PyInterpreterFrame *frame, _Py_CODEUNIT *start, _PyStackRef *stack_pointer, _PyExecutorObject **executor_ptr, int chain_depth) { - // The first instruction in a chain and the MAX_CHAIN_DEPTH'th instruction - // 
*must* make progress in order to avoid infinite loops or excessively-long + // The first executor in a chain and the MAX_CHAIN_DEPTH'th executor *must* + // make progress in order to avoid infinite loops or excessively-long // side-exit chains. We can only insert the executor into the bytecode if // this is true, since a deopt won't infinitely re-enter the executor: chain_depth %= MAX_CHAIN_DEPTH; @@ -607,6 +608,16 @@ translate_bytecode_to_trace( } } if (opcode == ENTER_EXECUTOR) { + // We have a couple of options here. We *could* peek "underneath" + // this executor and continue tracing, which could give us a longer, + // more optimizable trace (at the expense of lots of duplicated + // tier two code). Instead, we choose to just end here and stitch to + // the other trace, which allows side-exit traces to rejoin the + // "main" trace periodically (and also helps protect us against + // pathological behavior where the amount of tier two code explodes + // for a medium-length, branchy code path). This seems to work + // better in practice, but in the future we could be smarter about + // what we do here: goto done; } assert(opcode != ENTER_EXECUTOR && opcode != EXTENDED_ARG); RESERVE_RAW(2, "_CHECK_VALIDITY_AND_SET_IP"); ADD_TO_TRACE(_CHECK_VALIDITY_AND_SET_IP, 0, (uintptr_t)instr, target); @@ -1233,7 +1244,8 @@ uop_optimize( _PyInterpreterFrame *frame, _Py_CODEUNIT *instr, _PyExecutorObject **exec_ptr, - int curr_stackentries, bool progress_needed) + int curr_stackentries, + bool progress_needed) { _PyBloomFilter dependencies; _Py_BloomFilter_Init(&dependencies); @@ -1336,7 +1348,8 @@ counter_optimize( _PyInterpreterFrame *frame, _Py_CODEUNIT *instr, _PyExecutorObject **exec_ptr, - int Py_UNUSED(curr_stackentries), bool Py_UNUSED(progress_needed) + int Py_UNUSED(curr_stackentries), + bool Py_UNUSED(progress_needed) ) { PyCodeObject *code = _PyFrame_GetCode(frame);