From 56133bb0184e5d2b8bf5809b648aa512693a0a02 Mon Sep 17 00:00:00 2001 From: Guido van Rossum Date: Sat, 5 Aug 2023 12:13:46 -0700 Subject: [PATCH 01/34] Split `CALL_PY_EXACT_ARGS` into uops This is only the first step for doing `CALL` in Tier 2. The next step involves tracing into the called code object. After that we'll have to do the remaining `CALL` specialization. Finally we'll have to tweak various things like `KW_NAMES`, and possibly move the `NULL` (for method calls) *above* the callable. But those are things for future PRs. Note: this moves setting `frame->return_offset` directly in front of `DISPATCH_INLINED()`, to make it easier to move it into `_PUSH_FRAME`. --- Include/internal/pycore_opcode_metadata.h | 15 +++- Python/bytecodes.c | 35 +++++++--- Python/executor.c | 13 ++++ Python/executor_cases.c.h | 61 ++++++++++++++++ Python/generated_cases.c.h | 85 ++++++++++++++--------- Python/optimizer.c | 9 +++ Tools/cases_generator/generate_cases.py | 16 ++++- Tools/cases_generator/instructions.py | 20 +++--- Tools/cases_generator/stacking.py | 26 +++++-- 9 files changed, 220 insertions(+), 60 deletions(-) diff --git a/Include/internal/pycore_opcode_metadata.h b/Include/internal/pycore_opcode_metadata.h index 9f4437c09e92cb..3957c645c45336 100644 --- a/Include/internal/pycore_opcode_metadata.h +++ b/Include/internal/pycore_opcode_metadata.h @@ -52,9 +52,12 @@ #define _ITER_CHECK_RANGE 328 #define _IS_ITER_EXHAUSTED_RANGE 329 #define _ITER_NEXT_RANGE 330 -#define _POP_JUMP_IF_FALSE 331 -#define _POP_JUMP_IF_TRUE 332 -#define JUMP_TO_TOP 333 +#define _CHECK_CALL_PY_EXACT_ARGS 331 +#define _INIT_CALL_PY_EXACT_ARGS 332 +#define _PUSH_FRAME 333 +#define _POP_JUMP_IF_FALSE 334 +#define _POP_JUMP_IF_TRUE 335 +#define JUMP_TO_TOP 336 #ifndef NEED_OPCODE_METADATA extern int _PyOpcode_num_popped(int opcode, int oparg, bool jump); @@ -951,6 +954,7 @@ _PyOpcode_num_pushed(int opcode, int oparg, bool jump) { enum InstructionFormat { INSTR_FMT_IB, INSTR_FMT_IBC, + INSTR_FMT_IBC0, INSTR_FMT_IBC00, INSTR_FMT_IBC000, INSTR_FMT_IBC00000000, @@ -995,6 +999,7 @@ struct opcode_macro_expansion { #define OPARG_CACHE_4 4 #define OPARG_TOP 5 #define OPARG_BOTTOM 6 +#define OPARG_SAVE_IP 7 #define OPCODE_METADATA_FMT(OP) (_PyOpcode_opcode_metadata[(OP)].instr_format) #define SAME_OPCODE_METADATA(OP1, OP2) \ @@ -1336,6 +1341,7 @@ const struct opcode_macro_expansion _PyOpcode_macro_expansion[OPCODE_MACRO_EXPAN [GET_YIELD_FROM_ITER] = { .nuops = 1, .uops = { { GET_YIELD_FROM_ITER, 0, 0 } } }, [WITH_EXCEPT_START] = { .nuops = 1, .uops = { { WITH_EXCEPT_START, 0, 0 } } }, [PUSH_EXC_INFO] = { .nuops = 1, .uops = { { PUSH_EXC_INFO, 0, 0 } } }, + [CALL_PY_EXACT_ARGS] = { .nuops = 4, .uops = { { _CHECK_CALL_PY_EXACT_ARGS, 2, 1 }, { _INIT_CALL_PY_EXACT_ARGS, 0, 0 }, { SAVE_IP, 7, 3 }, { _PUSH_FRAME, 0, 0 } } }, [CALL_NO_KW_TYPE_1] = { .nuops = 1, .uops = { { CALL_NO_KW_TYPE_1, 0, 0 } } }, [CALL_NO_KW_STR_1] = { .nuops = 1, .uops = { { CALL_NO_KW_STR_1, 0, 0 } } }, [CALL_NO_KW_TUPLE_1] = { .nuops = 1, .uops = { { CALL_NO_KW_TUPLE_1, 0, 0 } } }, @@ -1389,6 +1395,9 @@ const char * const _PyOpcode_uop_name[OPCODE_UOP_NAME_SIZE] = { [_ITER_CHECK_RANGE] = "_ITER_CHECK_RANGE", [_IS_ITER_EXHAUSTED_RANGE] = "_IS_ITER_EXHAUSTED_RANGE", [_ITER_NEXT_RANGE] = "_ITER_NEXT_RANGE", + [_CHECK_CALL_PY_EXACT_ARGS] = "_CHECK_CALL_PY_EXACT_ARGS", + [_INIT_CALL_PY_EXACT_ARGS] = "_INIT_CALL_PY_EXACT_ARGS", + [_PUSH_FRAME] = "_PUSH_FRAME", [_POP_JUMP_IF_FALSE] = "_POP_JUMP_IF_FALSE", [_POP_JUMP_IF_TRUE] = "_POP_JUMP_IF_TRUE", [JUMP_TO_TOP] = "JUMP_TO_TOP", diff --git a/Python/bytecodes.c b/Python/bytecodes.c index b2281abc6663da..7e6d1167b76c08 100644 --- a/Python/bytecodes.c +++ b/Python/bytecodes.c @@ -957,13 +957,13 @@ dummy_func( { PyGenObject *gen = (PyGenObject *)receiver; _PyInterpreterFrame *gen_frame = (_PyInterpreterFrame *)gen->gi_iframe; - frame->return_offset = oparg; STACK_SHRINK(1); _PyFrame_StackPush(gen_frame, v); gen->gi_frame_state = FRAME_EXECUTING; gen->gi_exc_state.previous_item = tstate->exc_info; tstate->exc_info = &gen->gi_exc_state; SKIP_OVER(INLINE_CACHE_ENTRIES_SEND); + frame->return_offset = oparg; DISPATCH_INLINED(gen_frame); } if (Py_IsNone(v) && PyIter_Check(receiver)) { @@ -996,13 +996,13 @@ dummy_func( DEOPT_IF(gen->gi_frame_state >= FRAME_EXECUTING, SEND); STAT_INC(SEND, hit); _PyInterpreterFrame *gen_frame = (_PyInterpreterFrame *)gen->gi_iframe; - frame->return_offset = oparg; STACK_SHRINK(1); _PyFrame_StackPush(gen_frame, v); gen->gi_frame_state = FRAME_EXECUTING; gen->gi_exc_state.previous_item = tstate->exc_info; tstate->exc_info = &gen->gi_exc_state; SKIP_OVER(INLINE_CACHE_ENTRIES_SEND); + frame->return_offset = oparg; DISPATCH_INLINED(gen_frame); } @@ -2586,7 +2586,6 @@ dummy_func( DEOPT_IF(gen->gi_frame_state >= FRAME_EXECUTING, FOR_ITER); STAT_INC(FOR_ITER, hit); _PyInterpreterFrame *gen_frame = (_PyInterpreterFrame *)gen->gi_iframe; - frame->return_offset = oparg; _PyFrame_StackPush(gen_frame, Py_None); gen->gi_frame_state = FRAME_EXECUTING; gen->gi_exc_state.previous_item = tstate->exc_info; @@ -2594,6 +2593,7 @@ dummy_func( SKIP_OVER(INLINE_CACHE_ENTRIES_FOR_ITER); assert(next_instr[oparg].op.code == END_FOR || next_instr[oparg].op.code == INSTRUMENTED_END_FOR); + frame->return_offset = oparg; DISPATCH_INLINED(gen_frame); } @@ -2944,7 +2944,7 @@ dummy_func( GO_TO_INSTRUCTION(CALL_PY_EXACT_ARGS); } - inst(CALL_PY_EXACT_ARGS, (unused/1, func_version/2, callable, self_or_null, args[oparg] -- unused)) { + op(_CHECK_CALL_PY_EXACT_ARGS, (func_version/2, callable, self_or_null, unused[oparg] -- method, callable, unused[oparg])) { ASSERT_KWNAMES_IS_NULL(); DEOPT_IF(tstate->interp->eval_frame, CALL); int argcount = oparg; @@ -2958,19 +2958,36 @@ dummy_func( PyCodeObject *code = (PyCodeObject *)func->func_code; DEOPT_IF(code->co_argcount != argcount, CALL); DEOPT_IF(!_PyThreadState_HasStackSpace(tstate, code->co_framesize), CALL); + } + + op(_INIT_CALL_PY_EXACT_ARGS, (method, callable, args[oparg] -- new_frame: _PyInterpreterFrame*)) { + int is_meth = method != NULL; + int argcount = oparg; + if (is_meth) { + callable = method; + args--; + argcount++; + } STAT_INC(CALL, hit); - _PyInterpreterFrame *new_frame = _PyFrame_PushUnchecked(tstate, func, argcount); + PyFunctionObject *func = (PyFunctionObject *)callable; + new_frame = _PyFrame_PushUnchecked(tstate, func, argcount); for (int i = 0; i < argcount; i++) { new_frame->localsplus[i] = args[i]; } - // Manipulate stack directly since we leave using DISPATCH_INLINED(). - STACK_SHRINK(oparg + 2); - SKIP_OVER(INLINE_CACHE_ENTRIES_CALL); + } + + op(_PUSH_FRAME, (new_frame: _PyInterpreterFrame* -- unused)) { frame->return_offset = 0; DISPATCH_INLINED(new_frame); } - inst(CALL_PY_WITH_DEFAULTS, (unused/1, func_version/2, callable, self_or_null, args[oparg] -- unused)) { + macro(CALL_PY_EXACT_ARGS) = + unused/1 + // Skip over the counter + _CHECK_CALL_PY_EXACT_ARGS + + _INIT_CALL_PY_EXACT_ARGS + + _PUSH_FRAME; + + inst(CALL_PY_WITH_DEFAULTS, (unused/1, func_version/2, method, callable, args[oparg] -- unused)) { ASSERT_KWNAMES_IS_NULL(); DEOPT_IF(tstate->interp->eval_frame, CALL); int argcount = oparg; diff --git a/Python/executor.c b/Python/executor.c index 4a18618c0c6c0c..cd673a7beeef88 100644 --- a/Python/executor.c +++ b/Python/executor.c @@ -30,6 +30,19 @@ #undef ENABLE_SPECIALIZATION #define ENABLE_SPECIALIZATION 0 +#undef DISPATCH_INLINED +#define DISPATCH_INLINED(NEW_FRAME) \ + do { \ + assert(tstate->interp->eval_frame == NULL); \ + _PyFrame_SetStackPointer(frame, stack_pointer); \ + frame->prev_instr -= 1; \ + (NEW_FRAME)->previous = frame; \ + frame = tstate->cframe->current_frame = (NEW_FRAME); \ + CALL_STAT_INC(inlined_py_calls); \ + stack_pointer = _PyFrame_GetStackPointer(frame); \ + ip_offset = (_Py_CODEUNIT *)_PyFrame_GetCode(frame)->co_code_adaptive; \ + } while (0) + _PyInterpreterFrame * _PyUopExecute(_PyExecutorObject *executor, _PyInterpreterFrame *frame, PyObject **stack_pointer) diff --git a/Python/executor_cases.c.h b/Python/executor_cases.c.h index d6d541a3b61ab4..c7fe281895d945 100644 --- a/Python/executor_cases.c.h +++ b/Python/executor_cases.c.h @@ -2153,6 +2153,67 @@ break; } + case _CHECK_CALL_PY_EXACT_ARGS: { + PyObject *self_or_null; + PyObject *callable; + PyObject *method; + self_or_null = stack_pointer[-1 - oparg]; + callable = stack_pointer[-2 - oparg]; + uint32_t func_version = (uint32_t)operand; + ASSERT_KWNAMES_IS_NULL(); + DEOPT_IF(tstate->interp->eval_frame, CALL); + int argcount = oparg; + if (self_or_null != NULL) { + args--; + argcount++; + } + DEOPT_IF(!PyFunction_Check(callable), CALL); + PyFunctionObject *func = (PyFunctionObject *)callable; + DEOPT_IF(func->func_version != func_version, CALL); + PyCodeObject *code = (PyCodeObject *)func->func_code; + DEOPT_IF(code->co_argcount != argcount, CALL); + DEOPT_IF(!_PyThreadState_HasStackSpace(tstate, code->co_framesize), CALL); + stack_pointer[-2 - oparg] = method; + stack_pointer[-1 - oparg] = callable; + break; + } + + case _INIT_CALL_PY_EXACT_ARGS: { + PyObject **args; + PyObject *callable; + PyObject *method; + _PyInterpreterFrame *new_frame; + args = stack_pointer - oparg; + callable = stack_pointer[-1 - oparg]; + method = stack_pointer[-2 - oparg]; + int is_meth = method != NULL; + int argcount = oparg; + if (is_meth) { + callable = method; + args--; + argcount++; + } + STAT_INC(CALL, hit); + PyFunctionObject *func = (PyFunctionObject *)callable; + new_frame = _PyFrame_PushUnchecked(tstate, func, argcount); + for (int i = 0; i < argcount; i++) { + new_frame->localsplus[i] = args[i]; + } + STACK_SHRINK(oparg); + STACK_SHRINK(1); + stack_pointer[-1] = (PyObject *)new_frame; + break; + } + + case _PUSH_FRAME: { + _PyInterpreterFrame *new_frame; + new_frame = (_PyInterpreterFrame *)stack_pointer[-1]; + STACK_SHRINK(1); + frame->return_offset = 0; + DISPATCH_INLINED(new_frame); + break; + } + case CALL_NO_KW_TYPE_1: { PyObject **args; PyObject *null; diff --git a/Python/generated_cases.c.h b/Python/generated_cases.c.h index cf20b869b8182f..2137efeb507e82 100644 --- a/Python/generated_cases.c.h +++ b/Python/generated_cases.c.h @@ -1191,13 +1191,13 @@ { PyGenObject *gen = (PyGenObject *)receiver; _PyInterpreterFrame *gen_frame = (_PyInterpreterFrame *)gen->gi_iframe; - frame->return_offset = oparg; STACK_SHRINK(1); _PyFrame_StackPush(gen_frame, v); gen->gi_frame_state = FRAME_EXECUTING; gen->gi_exc_state.previous_item = tstate->exc_info; tstate->exc_info = &gen->gi_exc_state; SKIP_OVER(INLINE_CACHE_ENTRIES_SEND); + frame->return_offset = oparg; DISPATCH_INLINED(gen_frame); } if (Py_IsNone(v) && PyIter_Check(receiver)) { @@ -1237,13 +1237,13 @@ DEOPT_IF(gen->gi_frame_state >= FRAME_EXECUTING, SEND); STAT_INC(SEND, hit); _PyInterpreterFrame *gen_frame = (_PyInterpreterFrame *)gen->gi_iframe; - frame->return_offset = oparg; STACK_SHRINK(1); _PyFrame_StackPush(gen_frame, v); gen->gi_frame_state = FRAME_EXECUTING; gen->gi_exc_state.previous_item = tstate->exc_info; tstate->exc_info = &gen->gi_exc_state; SKIP_OVER(INLINE_CACHE_ENTRIES_SEND); + frame->return_offset = oparg; DISPATCH_INLINED(gen_frame); } @@ -3341,7 +3341,6 @@ DEOPT_IF(gen->gi_frame_state >= FRAME_EXECUTING, FOR_ITER); STAT_INC(FOR_ITER, hit); _PyInterpreterFrame *gen_frame = (_PyInterpreterFrame *)gen->gi_iframe; - frame->return_offset = oparg; _PyFrame_StackPush(gen_frame, Py_None); gen->gi_frame_state = FRAME_EXECUTING; gen->gi_exc_state.previous_item = tstate->exc_info; @@ -3349,6 +3348,7 @@ SKIP_OVER(INLINE_CACHE_ENTRIES_FOR_ITER); assert(next_instr[oparg].op.code == END_FOR || next_instr[oparg].op.code == INSTRUMENTED_END_FOR); + frame->return_offset = oparg; DISPATCH_INLINED(gen_frame); STACK_GROW(1); } @@ -3758,47 +3758,68 @@ TARGET(CALL_PY_EXACT_ARGS) { PREDICTED(CALL_PY_EXACT_ARGS); - PyObject **args; PyObject *self_or_null; PyObject *callable; - args = stack_pointer - oparg; + PyObject *method; + PyObject **args; + _PyInterpreterFrame *new_frame; + // _CHECK_CALL_PY_EXACT_ARGS self_or_null = stack_pointer[-1 - oparg]; callable = stack_pointer[-2 - oparg]; - uint32_t func_version = read_u32(&next_instr[1].cache); - ASSERT_KWNAMES_IS_NULL(); - DEOPT_IF(tstate->interp->eval_frame, CALL); - int argcount = oparg; - if (self_or_null != NULL) { - args--; - argcount++; - } - DEOPT_IF(!PyFunction_Check(callable), CALL); - PyFunctionObject *func = (PyFunctionObject *)callable; - DEOPT_IF(func->func_version != func_version, CALL); - PyCodeObject *code = (PyCodeObject *)func->func_code; - DEOPT_IF(code->co_argcount != argcount, CALL); - DEOPT_IF(!_PyThreadState_HasStackSpace(tstate, code->co_framesize), CALL); - STAT_INC(CALL, hit); - _PyInterpreterFrame *new_frame = _PyFrame_PushUnchecked(tstate, func, argcount); - for (int i = 0; i < argcount; i++) { - new_frame->localsplus[i] = args[i]; + { + uint32_t func_version = read_u32(&next_instr[1].cache); + ASSERT_KWNAMES_IS_NULL(); + DEOPT_IF(tstate->interp->eval_frame, CALL); + int argcount = oparg; + if (self_or_null != NULL) { + args--; + argcount++; + } + DEOPT_IF(!PyFunction_Check(callable), CALL); + PyFunctionObject *func = (PyFunctionObject *)callable; + DEOPT_IF(func->func_version != func_version, CALL); + PyCodeObject *code = (PyCodeObject *)func->func_code; + DEOPT_IF(code->co_argcount != argcount, CALL); + DEOPT_IF(!_PyThreadState_HasStackSpace(tstate, code->co_framesize), CALL); + } + stack_pointer[-2 - oparg] = method; + stack_pointer[-1 - oparg] = callable; + // _INIT_CALL_PY_EXACT_ARGS + args = stack_pointer - oparg; + callable = stack_pointer[-1 - oparg]; + method = stack_pointer[-2 - oparg]; + { + int is_meth = method != NULL; + int argcount = oparg; + if (is_meth) { + callable = method; + args--; + argcount++; + } + STAT_INC(CALL, hit); + PyFunctionObject *func = (PyFunctionObject *)callable; + new_frame = _PyFrame_PushUnchecked(tstate, func, argcount); + for (int i = 0; i < argcount; i++) { + new_frame->localsplus[i] = args[i]; + } } - // Manipulate stack directly since we leave using DISPATCH_INLINED(). - STACK_SHRINK(oparg + 2); - SKIP_OVER(INLINE_CACHE_ENTRIES_CALL); - frame->return_offset = 0; - DISPATCH_INLINED(new_frame); + // _PUSH_FRAME STACK_SHRINK(oparg); - STACK_SHRINK(1); + STACK_SHRINK(2); + next_instr += 3; + { + frame->return_offset = 0; + DISPATCH_INLINED(new_frame); + } } TARGET(CALL_PY_WITH_DEFAULTS) { PyObject **args; - PyObject *self_or_null; PyObject *callable; + PyObject *method; args = stack_pointer - oparg; - self_or_null = stack_pointer[-1 - oparg]; - callable = stack_pointer[-2 - oparg]; + callable = stack_pointer[-1 - oparg]; + method = stack_pointer[-2 - oparg]; uint32_t func_version = read_u32(&next_instr[1].cache); ASSERT_KWNAMES_IS_NULL(); DEOPT_IF(tstate->interp->eval_frame, CALL); diff --git a/Python/optimizer.c b/Python/optimizer.c index 6c730aa14b9a47..dbd5142b9101e9 100644 --- a/Python/optimizer.c +++ b/Python/optimizer.c @@ -602,6 +602,10 @@ translate_bytecode_to_trace( case OPARG_BOTTOM: // Second half of super-instr oparg = orig_oparg & 0xF; break; + case OPARG_SAVE_IP: // op==SAVE_IP; oparg=next instr + oparg = INSTR_IP(instr + offset, code); + break; + default: fprintf(stderr, "opcode=%d, oparg=%d; nuops=%d, i=%d; size=%d, offset=%d\n", @@ -611,6 +615,11 @@ translate_bytecode_to_trace( Py_FatalError("garbled expansion"); } ADD_TO_TRACE(expansion->uops[i].uop, oparg, operand); + if (expansion->uops[i].uop == _PUSH_FRAME) { + assert(i + 1 == nuops); + ADD_TO_TRACE(SAVE_IP, 0, 0); + goto done; + } } break; } diff --git a/Tools/cases_generator/generate_cases.py b/Tools/cases_generator/generate_cases.py index d35a16a80e8d00..f183845a667003 100644 --- a/Tools/cases_generator/generate_cases.py +++ b/Tools/cases_generator/generate_cases.py @@ -53,6 +53,7 @@ "OPARG_CACHE_4": 4, "OPARG_TOP": 5, "OPARG_BOTTOM": 6, + "OPARG_SAVE_IP": 7, } INSTR_FMT_PREFIX = "INSTR_FMT_" @@ -344,7 +345,7 @@ def write_metadata(self, metadata_filename: str, pymetadata_filename: str) -> No if instr.kind == "inst" and instr.is_viable_uop(): # Construct a dummy Component -- input/output mappings are not used part = Component(instr, instr.active_caches) - self.write_macro_expansions(instr.name, [part]) + self.write_macro_expansions(instr.name, [part], instr.cache_offset) elif instr.kind == "inst" and variable_used( instr.inst, "oparg1" ): @@ -354,7 +355,7 @@ def write_metadata(self, metadata_filename: str, pymetadata_filename: str) -> No self.write_super_expansions(instr.name) case parsing.Macro(): mac = self.macro_instrs[thing.name] - self.write_macro_expansions(mac.name, mac.parts) + self.write_macro_expansions(mac.name, mac.parts, mac.cache_offset) case parsing.Pseudo(): pass case _: @@ -428,13 +429,22 @@ def add(name: str) -> None: if instr.kind == "op" and instr.is_viable_uop(): add(instr.name) - def write_macro_expansions(self, name: str, parts: MacroParts) -> None: + def write_macro_expansions( + self, name: str, parts: MacroParts, cache_offset: int + ) -> None: """Write the macro expansions for a macro-instruction.""" # TODO: Refactor to share code with write_cody(), is_viaible_uop(), etc. offset = 0 # Cache effect offset expansions: list[tuple[str, int, int]] = [] # [(name, size, offset), ...] for part in parts: if isinstance(part, Component): + # _PUSH_FRAME is super special; it expands to SAVE_IP(next_instr) + _PUSH_FRAME + if part.instr.name == "_PUSH_FRAME": + expansions.append( + ("SAVE_IP", OPARG_SIZES["OPARG_SAVE_IP"], cache_offset) + ) + expansions.append(("_PUSH_FRAME", OPARG_SIZES["OPARG_FULL"], 0)) + continue # All component instructions must be viable uops if not part.instr.is_viable_uop(): # This note just reminds us about macros that cannot diff --git a/Tools/cases_generator/instructions.py b/Tools/cases_generator/instructions.py index aa94dbb07ea1c0..4219fa9c654798 100644 --- a/Tools/cases_generator/instructions.py +++ b/Tools/cases_generator/instructions.py @@ -59,7 +59,7 @@ class Instruction: block_line: int # First line of block in original code # Computed by constructor - always_exits: bool + always_exits: str # If the block always exits, its last line; else "" has_deopt: bool cache_offset: int cache_effects: list[parsing.CacheEffect] @@ -120,11 +120,13 @@ def __init__(self, inst: parsing.InstDef): def is_viable_uop(self) -> bool: """Whether this instruction is viable as a uop.""" dprint: typing.Callable[..., None] = lambda *args, **kwargs: None - # if self.name.startswith("CALL"): - # dprint = print + if "PY_EXACT" in self.name: + dprint = print if self.name == "EXIT_TRACE": return True # This has 'return frame' but it's okay + if self.name == "_PUSH_FRAME": + return True # Has DISPATCH_INLINED but it's okay if self.always_exits: dprint(f"Skipping {self.name} because it always exits") return False @@ -322,16 +324,16 @@ def extract_block_text(block: parsing.Block) -> tuple[list[str], bool, int]: return blocklines, check_eval_breaker, block_line -def always_exits(lines: list[str]) -> bool: +def always_exits(lines: list[str]) -> str: """Determine whether a block always ends in a return/goto/etc.""" if not lines: - return False + return "" line = lines[-1].rstrip() # Indent must match exactly (TODO: Do something better) if line[:12] != " " * 12: - return False + return "" line = line[12:] - return line.startswith( + if line.startswith( ( "goto ", "return ", @@ -340,4 +342,6 @@ def always_exits(lines: list[str]) -> bool: "Py_UNREACHABLE()", "ERROR_IF(true, ", ) - ) + ): + return line + return "" diff --git a/Tools/cases_generator/stacking.py b/Tools/cases_generator/stacking.py index d457ce01a8f438..aea6d70e1fcba9 100644 --- a/Tools/cases_generator/stacking.py +++ b/Tools/cases_generator/stacking.py @@ -295,6 +295,7 @@ def write_single_instr( [Component(instr, instr.active_caches)], out, tier, + 0, ) except AssertionError as err: raise AssertionError(f"Error writing instruction {instr.name}") from err @@ -306,12 +307,14 @@ def write_macro_instr( parts = [part for part in mac.parts if isinstance(part, Component)] cache_adjust = 0 + always_exits = False for part in mac.parts: match part: case CacheEffect(size=size): cache_adjust += size case Component(instr=instr): cache_adjust += instr.cache_offset + always_exits = instr.always_exits case _: typing.assert_never(part) @@ -321,18 +324,20 @@ def write_macro_instr( out.emit(f"PREDICTED({mac.name});") out.static_assert_family_size(mac.name, family, cache_adjust) try: - write_components(parts, out, TIER_ONE) + write_components(parts, out, TIER_ONE, cache_adjust) except AssertionError as err: raise AssertionError(f"Error writing macro {mac.name}") from err - if cache_adjust: - out.emit(f"next_instr += {cache_adjust};") - out.emit("DISPATCH();") + if not always_exits: + if cache_adjust: + out.emit(f"next_instr += {cache_adjust};") + out.emit("DISPATCH();") def write_components( parts: list[Component], out: Formatter, tier: Tiers, + cache_adjust: int, ) -> None: managers = get_managers(parts) @@ -374,13 +379,24 @@ def write_components( poke.as_stack_effect(lax=True), ) + dispatch_inlined_special_case = False + if mgr is managers[-1] and mgr.instr.always_exits.startswith("DISPATCH_INLINED") and mgr.instr.name == "_PUSH_FRAME": + dispatch_inlined_special_case = True + temp = mgr.final_offset.clone() + temp.deeper(StackEffect(UNUSED)) # Hack + out.stack_adjust(temp.deep, temp.high) + # Use clone() since adjust_inverse() mutates final_offset + mgr.adjust_inverse(mgr.final_offset.clone()) + if cache_adjust: + out.emit(f"next_instr += {cache_adjust};") + if len(parts) == 1: mgr.instr.write_body(out, 0, mgr.active_caches, tier) else: with out.block(""): mgr.instr.write_body(out, -4, mgr.active_caches, tier) - if mgr is managers[-1]: + if mgr is managers[-1] and not dispatch_inlined_special_case: out.stack_adjust(mgr.final_offset.deep, mgr.final_offset.high) # Use clone() since adjust_inverse() mutates final_offset mgr.adjust_inverse(mgr.final_offset.clone()) From 907ff95317c8d7b9221ce95c8823ed8d2cf4861c Mon Sep 17 00:00:00 2001 From: Guido van Rossum Date: Wed, 9 Aug 2023 12:00:18 -0700 Subject: [PATCH 02/34] Fix merge so it works again (I think) --- Python/bytecodes.c | 17 +++++------------ Python/executor_cases.c.h | 20 +++++--------------- Python/generated_cases.c.h | 24 +++++++----------------- 3 files changed, 17 insertions(+), 44 deletions(-) diff --git a/Python/bytecodes.c b/Python/bytecodes.c index 7e6d1167b76c08..accd4f798d1b26 100644 --- a/Python/bytecodes.c +++ b/Python/bytecodes.c @@ -2944,27 +2944,20 @@ dummy_func( GO_TO_INSTRUCTION(CALL_PY_EXACT_ARGS); } - op(_CHECK_CALL_PY_EXACT_ARGS, (func_version/2, callable, self_or_null, unused[oparg] -- method, callable, unused[oparg])) { + op(_CHECK_CALL_PY_EXACT_ARGS, (func_version/2, callable, self_or_null, unused[oparg] -- callable, self_or_null, unused[oparg])) { ASSERT_KWNAMES_IS_NULL(); DEOPT_IF(tstate->interp->eval_frame, CALL); - int argcount = oparg; - if (self_or_null != NULL) { - args--; - argcount++; - } DEOPT_IF(!PyFunction_Check(callable), CALL); PyFunctionObject *func = (PyFunctionObject *)callable; DEOPT_IF(func->func_version != func_version, CALL); PyCodeObject *code = (PyCodeObject *)func->func_code; - DEOPT_IF(code->co_argcount != argcount, CALL); + DEOPT_IF(code->co_argcount != oparg + (self_or_null != NULL), CALL); DEOPT_IF(!_PyThreadState_HasStackSpace(tstate, code->co_framesize), CALL); } - op(_INIT_CALL_PY_EXACT_ARGS, (method, callable, args[oparg] -- new_frame: _PyInterpreterFrame*)) { - int is_meth = method != NULL; + op(_INIT_CALL_PY_EXACT_ARGS, (callable, self_or_null, args[oparg] -- new_frame: _PyInterpreterFrame*)) { int argcount = oparg; - if (is_meth) { - callable = method; + if (self_or_null != NULL) { args--; argcount++; } @@ -2987,7 +2980,7 @@ dummy_func( _INIT_CALL_PY_EXACT_ARGS + _PUSH_FRAME; - inst(CALL_PY_WITH_DEFAULTS, (unused/1, func_version/2, method, callable, args[oparg] -- unused)) { + inst(CALL_PY_WITH_DEFAULTS, (unused/1, func_version/2, callable, self_or_null, args[oparg] -- unused)) { ASSERT_KWNAMES_IS_NULL(); DEOPT_IF(tstate->interp->eval_frame, CALL); int argcount = oparg; diff --git a/Python/executor_cases.c.h b/Python/executor_cases.c.h index c7fe281895d945..b4d25a4522ada2 100644 --- a/Python/executor_cases.c.h +++ b/Python/executor_cases.c.h @@ -2156,40 +2156,30 @@ case _CHECK_CALL_PY_EXACT_ARGS: { PyObject *self_or_null; PyObject *callable; - PyObject *method; self_or_null = stack_pointer[-1 - oparg]; callable = stack_pointer[-2 - oparg]; uint32_t func_version = (uint32_t)operand; ASSERT_KWNAMES_IS_NULL(); DEOPT_IF(tstate->interp->eval_frame, CALL); - int argcount = oparg; - if (self_or_null != NULL) { - args--; - argcount++; - } DEOPT_IF(!PyFunction_Check(callable), CALL); PyFunctionObject *func = (PyFunctionObject *)callable; DEOPT_IF(func->func_version != func_version, CALL); PyCodeObject *code = (PyCodeObject *)func->func_code; - DEOPT_IF(code->co_argcount != argcount, CALL); + DEOPT_IF(code->co_argcount != oparg + (self_or_null != NULL), CALL); DEOPT_IF(!_PyThreadState_HasStackSpace(tstate, code->co_framesize), CALL); - stack_pointer[-2 - oparg] = method; - stack_pointer[-1 - oparg] = callable; break; } case _INIT_CALL_PY_EXACT_ARGS: { PyObject **args; + PyObject *self_or_null; PyObject *callable; - PyObject *method; _PyInterpreterFrame *new_frame; args = stack_pointer - oparg; - callable = stack_pointer[-1 - oparg]; - method = stack_pointer[-2 - oparg]; - int is_meth = method != NULL; + self_or_null = stack_pointer[-1 - oparg]; + callable = stack_pointer[-2 - oparg]; int argcount = oparg; - if (is_meth) { - callable = method; + if (self_or_null != NULL) { args--; argcount++; } diff --git a/Python/generated_cases.c.h b/Python/generated_cases.c.h index 2137efeb507e82..21e00e629d22e0 100644 --- a/Python/generated_cases.c.h +++ b/Python/generated_cases.c.h @@ -3760,7 +3760,6 @@ PREDICTED(CALL_PY_EXACT_ARGS); PyObject *self_or_null; PyObject *callable; - PyObject *method; PyObject **args; _PyInterpreterFrame *new_frame; // _CHECK_CALL_PY_EXACT_ARGS @@ -3770,29 +3769,20 @@ uint32_t func_version = read_u32(&next_instr[1].cache); ASSERT_KWNAMES_IS_NULL(); DEOPT_IF(tstate->interp->eval_frame, CALL); - int argcount = oparg; - if (self_or_null != NULL) { - args--; - argcount++; - } DEOPT_IF(!PyFunction_Check(callable), CALL); PyFunctionObject *func = (PyFunctionObject *)callable; DEOPT_IF(func->func_version != func_version, CALL); PyCodeObject *code = (PyCodeObject *)func->func_code; - DEOPT_IF(code->co_argcount != argcount, CALL); + DEOPT_IF(code->co_argcount != oparg + (self_or_null != NULL), CALL); DEOPT_IF(!_PyThreadState_HasStackSpace(tstate, code->co_framesize), CALL); } - stack_pointer[-2 - oparg] = method; - stack_pointer[-1 - oparg] = callable; // _INIT_CALL_PY_EXACT_ARGS args = stack_pointer - oparg; - callable = stack_pointer[-1 - oparg]; - method = stack_pointer[-2 - oparg]; + self_or_null = stack_pointer[-1 - oparg]; + callable = stack_pointer[-2 - oparg]; { - int is_meth = method != NULL; int argcount = oparg; - if (is_meth) { - callable = method; + if (self_or_null != NULL) { args--; argcount++; } @@ -3815,11 +3805,11 @@ TARGET(CALL_PY_WITH_DEFAULTS) { PyObject **args; + PyObject *self_or_null; PyObject *callable; - PyObject *method; args = stack_pointer - oparg; - callable = stack_pointer[-1 - oparg]; - method = stack_pointer[-2 - oparg]; + self_or_null = stack_pointer[-1 - oparg]; + callable = stack_pointer[-2 - oparg]; uint32_t func_version = read_u32(&next_instr[1].cache); ASSERT_KWNAMES_IS_NULL(); DEOPT_IF(tstate->interp->eval_frame, CALL); From 2c6be6d65290d85a1c43f4491202dbbbb1e38778 Mon Sep 17 00:00:00 2001 From: Guido van Rossum Date: Wed, 9 Aug 2023 12:15:46 -0700 Subject: [PATCH 03/34] Split into finer-grained uops --- Include/internal/pycore_opcode_metadata.h | 20 ++++++++++++-------- Python/bytecodes.c | 16 +++++++++++++--- Python/executor_cases.c.h | 16 ++++++++++++++-- Python/generated_cases.c.h | 13 +++++++++++-- 4 files changed, 50 insertions(+), 15 deletions(-) diff --git a/Include/internal/pycore_opcode_metadata.h b/Include/internal/pycore_opcode_metadata.h index 3957c645c45336..3ebbf3fcc8c8a2 100644 --- a/Include/internal/pycore_opcode_metadata.h +++ b/Include/internal/pycore_opcode_metadata.h @@ -52,12 +52,14 @@ #define _ITER_CHECK_RANGE 328 #define _IS_ITER_EXHAUSTED_RANGE 329 #define _ITER_NEXT_RANGE 330 -#define _CHECK_CALL_PY_EXACT_ARGS 331 -#define _INIT_CALL_PY_EXACT_ARGS 332 -#define _PUSH_FRAME 333 -#define _POP_JUMP_IF_FALSE 334 -#define _POP_JUMP_IF_TRUE 335 -#define JUMP_TO_TOP 336 +#define _CHECK_PEP_523 331 +#define _CHECK_FUNCTION_EXACT_ARGS 332 +#define _CHECK_STACK_SPACE 333 +#define _INIT_CALL_PY_EXACT_ARGS 334 +#define _PUSH_FRAME 335 +#define _POP_JUMP_IF_FALSE 336 +#define _POP_JUMP_IF_TRUE 337 +#define JUMP_TO_TOP 338 #ifndef NEED_OPCODE_METADATA extern int _PyOpcode_num_popped(int opcode, int oparg, bool jump); @@ -1341,7 +1343,7 @@ const struct opcode_macro_expansion _PyOpcode_macro_expansion[OPCODE_MACRO_EXPAN [GET_YIELD_FROM_ITER] = { .nuops = 1, .uops = { { GET_YIELD_FROM_ITER, 0, 0 } } }, [WITH_EXCEPT_START] = { .nuops = 1, .uops = { { WITH_EXCEPT_START, 0, 0 } } }, [PUSH_EXC_INFO] = { .nuops = 1, .uops = { { PUSH_EXC_INFO, 0, 0 } } }, - [CALL_PY_EXACT_ARGS] = { .nuops = 4, .uops = { { _CHECK_CALL_PY_EXACT_ARGS, 2, 1 }, { _INIT_CALL_PY_EXACT_ARGS, 0, 0 }, { SAVE_IP, 7, 3 }, { _PUSH_FRAME, 0, 0 } } }, + [CALL_PY_EXACT_ARGS] = { .nuops = 6, .uops = { { _CHECK_PEP_523, 0, 0 }, { _CHECK_FUNCTION_EXACT_ARGS, 2, 1 }, { _CHECK_STACK_SPACE, 0, 0 }, { _INIT_CALL_PY_EXACT_ARGS, 0, 0 }, { SAVE_IP, 7, 3 }, { _PUSH_FRAME, 0, 0 } } }, [CALL_NO_KW_TYPE_1] = { .nuops = 1, .uops = { { CALL_NO_KW_TYPE_1, 0, 0 } } }, [CALL_NO_KW_STR_1] = { .nuops = 1, .uops = { { CALL_NO_KW_STR_1, 0, 0 } } }, [CALL_NO_KW_TUPLE_1] = { .nuops = 1, .uops = { { CALL_NO_KW_TUPLE_1, 0, 0 } } }, @@ -1395,7 +1397,9 @@ const char * const _PyOpcode_uop_name[OPCODE_UOP_NAME_SIZE] = { [_ITER_CHECK_RANGE] = "_ITER_CHECK_RANGE", [_IS_ITER_EXHAUSTED_RANGE] = "_IS_ITER_EXHAUSTED_RANGE", [_ITER_NEXT_RANGE] = "_ITER_NEXT_RANGE", - [_CHECK_CALL_PY_EXACT_ARGS] = "_CHECK_CALL_PY_EXACT_ARGS", + [_CHECK_PEP_523] = "_CHECK_PEP_523", + [_CHECK_FUNCTION_EXACT_ARGS] = "_CHECK_FUNCTION_EXACT_ARGS", + [_CHECK_STACK_SPACE] = "_CHECK_STACK_SPACE", [_INIT_CALL_PY_EXACT_ARGS] = "_INIT_CALL_PY_EXACT_ARGS", [_PUSH_FRAME] = "_PUSH_FRAME", [_POP_JUMP_IF_FALSE] = "_POP_JUMP_IF_FALSE", diff --git a/Python/bytecodes.c b/Python/bytecodes.c index accd4f798d1b26..48e523264d8e62 100644 --- a/Python/bytecodes.c +++ b/Python/bytecodes.c @@ -2944,14 +2944,22 @@ dummy_func( GO_TO_INSTRUCTION(CALL_PY_EXACT_ARGS); } - op(_CHECK_CALL_PY_EXACT_ARGS, (func_version/2, callable, self_or_null, unused[oparg] -- callable, self_or_null, unused[oparg])) { - ASSERT_KWNAMES_IS_NULL(); + op(_CHECK_PEP_523, (--)) { DEOPT_IF(tstate->interp->eval_frame, CALL); + } + + op(_CHECK_FUNCTION_EXACT_ARGS, (func_version/2, callable, self_or_null, unused[oparg] -- callable, self_or_null, unused[oparg])) { + ASSERT_KWNAMES_IS_NULL(); DEOPT_IF(!PyFunction_Check(callable), CALL); PyFunctionObject *func = (PyFunctionObject *)callable; DEOPT_IF(func->func_version != func_version, CALL); PyCodeObject *code = (PyCodeObject *)func->func_code; DEOPT_IF(code->co_argcount != oparg + (self_or_null != NULL), CALL); + } + + op(_CHECK_STACK_SPACE, (callable, unused, unused[oparg] -- callable, unused, unused[oparg])) { + PyFunctionObject *func = (PyFunctionObject *)callable; + PyCodeObject *code = (PyCodeObject *)func->func_code; DEOPT_IF(!_PyThreadState_HasStackSpace(tstate, code->co_framesize), CALL); } @@ -2976,7 +2984,9 @@ dummy_func( macro(CALL_PY_EXACT_ARGS) = unused/1 + // Skip over the counter - _CHECK_CALL_PY_EXACT_ARGS + + _CHECK_PEP_523 + + _CHECK_FUNCTION_EXACT_ARGS + + _CHECK_STACK_SPACE + _INIT_CALL_PY_EXACT_ARGS + _PUSH_FRAME; diff --git a/Python/executor_cases.c.h b/Python/executor_cases.c.h index b4d25a4522ada2..d76c66ee304df1 100644 --- a/Python/executor_cases.c.h +++ b/Python/executor_cases.c.h @@ -2153,19 +2153,31 @@ break; } - case _CHECK_CALL_PY_EXACT_ARGS: { + case _CHECK_PEP_523: { + DEOPT_IF(tstate->interp->eval_frame, CALL); + break; + } + + case _CHECK_FUNCTION_EXACT_ARGS: { PyObject *self_or_null; PyObject *callable; self_or_null = stack_pointer[-1 - oparg]; callable = stack_pointer[-2 - oparg]; uint32_t func_version = (uint32_t)operand; ASSERT_KWNAMES_IS_NULL(); - DEOPT_IF(tstate->interp->eval_frame, CALL); DEOPT_IF(!PyFunction_Check(callable), CALL); PyFunctionObject *func = (PyFunctionObject *)callable; DEOPT_IF(func->func_version != func_version, CALL); PyCodeObject *code = (PyCodeObject *)func->func_code; DEOPT_IF(code->co_argcount != oparg + (self_or_null != NULL), CALL); + break; + } + + case _CHECK_STACK_SPACE: { + PyObject *callable; + callable = stack_pointer[-2 - oparg]; + PyFunctionObject *func = (PyFunctionObject *)callable; + PyCodeObject *code = (PyCodeObject *)func->func_code; DEOPT_IF(!_PyThreadState_HasStackSpace(tstate, code->co_framesize), CALL); break; } diff --git a/Python/generated_cases.c.h b/Python/generated_cases.c.h index 21e00e629d22e0..f13b9066f937b2 100644 --- a/Python/generated_cases.c.h +++ b/Python/generated_cases.c.h @@ -3762,18 +3762,27 @@ PyObject *callable; PyObject **args; _PyInterpreterFrame *new_frame; - // _CHECK_CALL_PY_EXACT_ARGS + // _CHECK_PEP_523 + { + DEOPT_IF(tstate->interp->eval_frame, CALL); + } + // _CHECK_FUNCTION_EXACT_ARGS self_or_null = stack_pointer[-1 - oparg]; callable = stack_pointer[-2 - oparg]; { uint32_t func_version = read_u32(&next_instr[1].cache); ASSERT_KWNAMES_IS_NULL(); - DEOPT_IF(tstate->interp->eval_frame, CALL); DEOPT_IF(!PyFunction_Check(callable), CALL); PyFunctionObject *func = (PyFunctionObject *)callable; DEOPT_IF(func->func_version != func_version, CALL); PyCodeObject *code = (PyCodeObject *)func->func_code; DEOPT_IF(code->co_argcount != oparg + (self_or_null != NULL), CALL); + } + // _CHECK_STACK_SPACE + callable = stack_pointer[-2 - oparg]; + { + PyFunctionObject *func = (PyFunctionObject *)callable; + PyCodeObject *code = (PyCodeObject *)func->func_code; DEOPT_IF(!_PyThreadState_HasStackSpace(tstate, code->co_framesize), CALL); } // _INIT_CALL_PY_EXACT_ARGS From 6d78ff2926d558de0508be62c8f23c99962202b5 Mon Sep 17 00:00:00 2001 From: Guido van Rossum Date: Wed, 9 Aug 2023 17:24:45 -0700 Subject: [PATCH 04/34] Fix type error in stacking.py --- Tools/cases_generator/stacking.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Tools/cases_generator/stacking.py b/Tools/cases_generator/stacking.py index aea6d70e1fcba9..86b9ee92c80be5 100644 --- a/Tools/cases_generator/stacking.py +++ b/Tools/cases_generator/stacking.py @@ -307,7 +307,7 @@ def write_macro_instr( parts = [part for part in mac.parts if isinstance(part, Component)] cache_adjust = 0 - always_exits = False + always_exits = "" for part in mac.parts: match part: case CacheEffect(size=size): From 0d8e66c91da65571bd990a3cf9e092fa68d264fb Mon Sep 17 00:00:00 2001 From: Guido van Rossum Date: Wed, 9 Aug 2023 17:44:59 -0700 Subject: [PATCH 05/34] Add test --- Lib/test/test_capi/test_misc.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/Lib/test/test_capi/test_misc.py b/Lib/test/test_capi/test_misc.py index 001d37de8e0eb3..36b8e1fa993b98 100644 --- a/Lib/test/test_capi/test_misc.py +++ b/Lib/test/test_capi/test_misc.py @@ -2618,6 +2618,23 @@ def testfunc(it): with self.assertRaises(StopIteration): next(it) + def test_call_py_exact_args(self): + def testfunc(n): + def dummy(x): + return x+1 + for i in range(n): + dummy(i) + + opt = _testinternalcapi.get_uop_optimizer() + with temporary_optimizer(opt): + testfunc(10) + + ex = get_first_executor(testfunc) + self.assertIsNotNone(ex) + uops = {opname for opname, _, _ in ex} + self.assertIn("_PUSH_FRAME", uops) + + if __name__ == "__main__": unittest.main() From b75f30eb37c8ac68a014a7a3318e549e9e4d4ea3 Mon Sep 17 00:00:00 2001 From: Guido van Rossum Date: Wed, 9 Aug 2023 18:21:20 -0700 Subject: [PATCH 06/34] Add comment explaining _PUSH_FRAME's unused output effect --- Python/bytecodes.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Python/bytecodes.c b/Python/bytecodes.c index 48e523264d8e62..e749348bdae488 100644 --- a/Python/bytecodes.c +++ b/Python/bytecodes.c @@ -2977,6 +2977,9 @@ dummy_func( } } + // The 'unused' output effect represents the return value + // (which will be pushed when the frame returns). + // It is needed so CALL_PY_EXACT_ARGS matches its family. op(_PUSH_FRAME, (new_frame: _PyInterpreterFrame* -- unused)) { frame->return_offset = 0; DISPATCH_INLINED(new_frame); From 61c2822cde6b709855c1052df38ed05cb059b7af Mon Sep 17 00:00:00 2001 From: Guido van Rossum Date: Thu, 10 Aug 2023 07:53:35 -0700 Subject: [PATCH 07/34] Make PUSH_FRAME special case a little less myterious --- Tools/cases_generator/stacking.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Tools/cases_generator/stacking.py b/Tools/cases_generator/stacking.py index 86b9ee92c80be5..d9eaacfcbeb974 100644 --- a/Tools/cases_generator/stacking.py +++ b/Tools/cases_generator/stacking.py @@ -382,9 +382,8 @@ def write_components( dispatch_inlined_special_case = False if mgr is managers[-1] and mgr.instr.always_exits.startswith("DISPATCH_INLINED") and mgr.instr.name == "_PUSH_FRAME": dispatch_inlined_special_case = True - temp = mgr.final_offset.clone() - temp.deeper(StackEffect(UNUSED)) # Hack - out.stack_adjust(temp.deep, temp.high) + # Adjust stack to min_offset (input effects materialized) + out.stack_adjust(mgr.min_offset.deep, mgr.min_offset.high) # Use clone() since adjust_inverse() mutates final_offset mgr.adjust_inverse(mgr.final_offset.clone()) if cache_adjust: @@ -397,6 +396,7 @@ def write_components( mgr.instr.write_body(out, -4, mgr.active_caches, tier) if mgr is managers[-1] and not dispatch_inlined_special_case: + # TODO: Explain why this adjustment is needed. out.stack_adjust(mgr.final_offset.deep, mgr.final_offset.high) # Use clone() since adjust_inverse() mutates final_offset mgr.adjust_inverse(mgr.final_offset.clone()) From f73ea90f72f9eeef25f6eb9ada1b83a6cbca54e1 Mon Sep 17 00:00:00 2001 From: Guido van Rossum Date: Thu, 10 Aug 2023 15:46:09 -0700 Subject: [PATCH 08/34] Rename Instruction.write to write_case_body --- Tools/cases_generator/generate_cases.py | 5 +++-- Tools/cases_generator/instructions.py | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/Tools/cases_generator/generate_cases.py b/Tools/cases_generator/generate_cases.py index f183845a667003..3563f16a5978a6 100644 --- a/Tools/cases_generator/generate_cases.py +++ b/Tools/cases_generator/generate_cases.py @@ -22,6 +22,7 @@ PseudoInstruction, StackEffect, OverriddenInstructionPlaceHolder, + TIER_ONE, TIER_TWO, ) import parsing @@ -597,7 +598,7 @@ def write_executor_instructions( n_instrs += 1 self.out.emit("") with self.out.block(f"case {thing.name}:"): - instr.write(self.out, tier=TIER_TWO) + instr.write_case_body(self.out, tier=TIER_TWO) if instr.check_eval_breaker: self.out.emit("CHECK_EVAL_BREAKER();") self.out.emit("break;") @@ -630,7 +631,7 @@ def write_instr(self, instr: Instruction) -> None: with self.out.block(f"TARGET({name})"): if instr.predicted: self.out.emit(f"PREDICTED({name});") - instr.write(self.out) + instr.write_case_body(self.out, tier=TIER_ONE) if not instr.always_exits: if instr.check_eval_breaker: self.out.emit("CHECK_EVAL_BREAKER();") diff --git a/Tools/cases_generator/instructions.py b/Tools/cases_generator/instructions.py index 4219fa9c654798..c52f0dcb0ced62 100644 --- a/Tools/cases_generator/instructions.py +++ b/Tools/cases_generator/instructions.py @@ -142,7 +142,7 @@ def is_viable_uop(self) -> bool: res = False return res - def write(self, out: Formatter, tier: Tiers = TIER_ONE) -> None: + def write_case_body(self, out: Formatter, tier: Tiers) -> None: """Write one instruction, sans prologue and epilogue.""" # Write a static assertion that a family's cache size is correct From 12910fcd18be115e99a7575d38c5e7967a97fe45 Mon Sep 17 00:00:00 2001 From: Guido van Rossum Date: Thu, 10 Aug 2023 16:01:28 -0700 Subject: [PATCH 09/34] Move next_instr update to a more logical place --- Tools/cases_generator/generate_cases.py | 2 ++ Tools/cases_generator/instructions.py | 8 -------- 2 files changed, 2 insertions(+), 8 deletions(-) diff --git a/Tools/cases_generator/generate_cases.py b/Tools/cases_generator/generate_cases.py index 3563f16a5978a6..d84ee47bc8a5bf 100644 --- a/Tools/cases_generator/generate_cases.py +++ b/Tools/cases_generator/generate_cases.py @@ -633,6 +633,8 @@ def write_instr(self, instr: Instruction) -> None: self.out.emit(f"PREDICTED({name});") instr.write_case_body(self.out, tier=TIER_ONE) if not instr.always_exits: + if instr.cache_offset: + self.out.emit(f"next_instr += {instr.cache_offset};") if instr.check_eval_breaker: self.out.emit("CHECK_EVAL_BREAKER();") self.out.emit(f"DISPATCH();") diff --git a/Tools/cases_generator/instructions.py b/Tools/cases_generator/instructions.py index c52f0dcb0ced62..7094320a4a3862 100644 --- a/Tools/cases_generator/instructions.py +++ b/Tools/cases_generator/instructions.py @@ -151,14 +151,6 @@ def write_case_body(self, out: Formatter, tier: Tiers) -> None: # Write input stack effect variable declarations and initializations stacking.write_single_instr(self, out, tier) - # Skip the rest if the block always exits - if self.always_exits: - return - - # Write cache effect - if tier == TIER_ONE and self.cache_offset: - out.emit(f"next_instr += {self.cache_offset};") - def write_body( self, out: Formatter, From 2fafa2c16116af63834792c06ba88d8f2c22043f Mon Sep 17 00:00:00 2001 From: Guido van Rossum Date: Thu, 10 Aug 2023 16:19:06 -0700 Subject: [PATCH 10/34] Don't recompute macro cache offset --- Tools/cases_generator/stacking.py | 29 ++++++++--------------------- 1 file changed, 8 insertions(+), 21 deletions(-) diff --git a/Tools/cases_generator/stacking.py b/Tools/cases_generator/stacking.py index d9eaacfcbeb974..ad63bbbdb0505c 100644 --- a/Tools/cases_generator/stacking.py +++ b/Tools/cases_generator/stacking.py @@ -305,31 +305,18 @@ def write_macro_instr( mac: MacroInstruction, out: Formatter, family: Family | None ) -> None: parts = [part for part in mac.parts if isinstance(part, Component)] - - cache_adjust = 0 - always_exits = "" - for part in mac.parts: - match part: - case CacheEffect(size=size): - cache_adjust += size - case Component(instr=instr): - cache_adjust += instr.cache_offset - always_exits = instr.always_exits - case _: - typing.assert_never(part) - out.emit("") with out.block(f"TARGET({mac.name})"): if mac.predicted: out.emit(f"PREDICTED({mac.name});") - out.static_assert_family_size(mac.name, family, cache_adjust) + out.static_assert_family_size(mac.name, family, mac.cache_offset) try: - write_components(parts, out, TIER_ONE, cache_adjust) + write_components(parts, out, TIER_ONE, mac.cache_offset) except AssertionError as err: raise AssertionError(f"Error writing macro {mac.name}") from err - if not always_exits: - if cache_adjust: - out.emit(f"next_instr += {cache_adjust};") + if not parts[-1].instr.always_exits: + if mac.cache_offset: + out.emit(f"next_instr += {mac.cache_offset};") out.emit("DISPATCH();") @@ -337,7 +324,7 @@ def write_components( parts: list[Component], out: Formatter, tier: Tiers, - cache_adjust: int, + cache_offset: int, ) -> None: managers = get_managers(parts) @@ -386,8 +373,8 @@ def write_components( out.stack_adjust(mgr.min_offset.deep, mgr.min_offset.high) # Use clone() since adjust_inverse() mutates final_offset mgr.adjust_inverse(mgr.final_offset.clone()) - if cache_adjust: - out.emit(f"next_instr += {cache_adjust};") + if cache_offset: + out.emit(f"next_instr += {cache_offset};") if len(parts) == 1: mgr.instr.write_body(out, 0, mgr.active_caches, tier) From 2717b0738b3c25b996288f19579d125c9ffda830 Mon Sep 17 00:00:00 2001 From: Guido van Rossum Date: Thu, 10 Aug 2023 16:22:58 -0700 Subject: [PATCH 11/34] Fold and refactor long line in stacking.py --- Tools/cases_generator/stacking.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/Tools/cases_generator/stacking.py b/Tools/cases_generator/stacking.py index ad63bbbdb0505c..c4f5ee29ef41af 100644 --- a/Tools/cases_generator/stacking.py +++ b/Tools/cases_generator/stacking.py @@ -366,9 +366,12 @@ def write_components( poke.as_stack_effect(lax=True), ) - dispatch_inlined_special_case = False - if mgr is managers[-1] and mgr.instr.always_exits.startswith("DISPATCH_INLINED") and mgr.instr.name == "_PUSH_FRAME": - dispatch_inlined_special_case = True + dispatch_inlined_special_case = ( + mgr is managers[-1] + and mgr.instr.always_exits.startswith("DISPATCH_INLINED") + and mgr.instr.name == "_PUSH_FRAME" + ) + if dispatch_inlined_special_case: # Adjust stack to min_offset (input effects materialized) out.stack_adjust(mgr.min_offset.deep, mgr.min_offset.high) # Use clone() since adjust_inverse() mutates final_offset From e48790885a448f1065411ad0eda0e69ca4896808 Mon Sep 17 00:00:00 2001 From: Guido van Rossum Date: Thu, 10 Aug 2023 16:24:33 -0700 Subject: [PATCH 12/34] Fold long lines in generate_cases.py --- Tools/cases_generator/generate_cases.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/Tools/cases_generator/generate_cases.py b/Tools/cases_generator/generate_cases.py index d84ee47bc8a5bf..c0a11bcd23dd7f 100644 --- a/Tools/cases_generator/generate_cases.py +++ b/Tools/cases_generator/generate_cases.py @@ -346,7 +346,9 @@ def write_metadata(self, metadata_filename: str, pymetadata_filename: str) -> No if instr.kind == "inst" and instr.is_viable_uop(): # Construct a dummy Component -- input/output mappings are not used part = Component(instr, instr.active_caches) - self.write_macro_expansions(instr.name, [part], instr.cache_offset) + self.write_macro_expansions( + instr.name, [part], instr.cache_offset + ) elif instr.kind == "inst" and variable_used( instr.inst, "oparg1" ): @@ -356,7 +358,9 @@ def write_metadata(self, metadata_filename: str, pymetadata_filename: str) -> No self.write_super_expansions(instr.name) case parsing.Macro(): mac = self.macro_instrs[thing.name] - self.write_macro_expansions(mac.name, mac.parts, mac.cache_offset) + self.write_macro_expansions( + mac.name, mac.parts, mac.cache_offset + ) case parsing.Pseudo(): pass case _: @@ -562,7 +566,9 @@ def write_instructions( case parsing.Macro(): n_macros += 1 mac = self.macro_instrs[thing.name] - stacking.write_macro_instr(mac, self.out, self.families.get(mac.name)) + stacking.write_macro_instr( + mac, self.out, self.families.get(mac.name) + ) # self.write_macro(self.macro_instrs[thing.name]) case parsing.Pseudo(): pass From 1d549af5f4b0d28f22da4561ba0d1ba16e226823 Mon Sep 17 00:00:00 2001 From: Guido van Rossum Date: Thu, 10 Aug 2023 16:52:09 -0700 Subject: [PATCH 13/34] Don't emit static assert to executor cases --- Python/executor_cases.c.h | 9 --------- Tools/cases_generator/generate_cases.py | 1 + Tools/cases_generator/instructions.py | 5 ----- 3 files changed, 1 insertion(+), 14 deletions(-) diff --git a/Python/executor_cases.c.h b/Python/executor_cases.c.h index d76c66ee304df1..f63c5c60fdc085 100644 --- a/Python/executor_cases.c.h +++ b/Python/executor_cases.c.h @@ -103,7 +103,6 @@ } case TO_BOOL: { - static_assert(INLINE_CACHE_ENTRIES_TO_BOOL == 3, "incorrect cache size"); PyObject *value; PyObject *res; value = stack_pointer[-1]; @@ -363,7 +362,6 @@ } case BINARY_SUBSCR: { - static_assert(INLINE_CACHE_ENTRIES_BINARY_SUBSCR == 1, "incorrect cache size"); PyObject *sub; PyObject *container; PyObject *res; @@ -557,7 +555,6 @@ } case STORE_SUBSCR: { - static_assert(INLINE_CACHE_ENTRIES_STORE_SUBSCR == 1, "incorrect cache size"); PyObject *sub; PyObject *container; PyObject *v; @@ -862,7 +859,6 @@ } case UNPACK_SEQUENCE: { - static_assert(INLINE_CACHE_ENTRIES_UNPACK_SEQUENCE == 1, "incorrect cache size"); PyObject *seq; seq = stack_pointer[-1]; #if ENABLE_SPECIALIZATION @@ -950,7 +946,6 @@ } case STORE_ATTR: { - static_assert(INLINE_CACHE_ENTRIES_STORE_ATTR == 4, "incorrect cache size"); PyObject *owner; PyObject *v; owner = stack_pointer[-1]; @@ -1061,7 +1056,6 @@ } case LOAD_GLOBAL: { - static_assert(INLINE_CACHE_ENTRIES_LOAD_GLOBAL == 4, "incorrect cache size"); PyObject *res; PyObject *null = NULL; #if ENABLE_SPECIALIZATION @@ -1554,7 +1548,6 @@ } case LOAD_ATTR: { - static_assert(INLINE_CACHE_ENTRIES_LOAD_ATTR == 9, "incorrect cache size"); PyObject *owner; PyObject *attr; PyObject *self_or_null = NULL; @@ -1648,7 +1641,6 @@ } case COMPARE_OP: { - static_assert(INLINE_CACHE_ENTRIES_COMPARE_OP == 1, "incorrect cache size"); PyObject *right; PyObject *left; PyObject *res; @@ -2717,7 +2709,6 @@ } case BINARY_OP: { - static_assert(INLINE_CACHE_ENTRIES_BINARY_OP == 1, "incorrect cache size"); PyObject *rhs; PyObject *lhs; PyObject *res; diff --git a/Tools/cases_generator/generate_cases.py b/Tools/cases_generator/generate_cases.py index c0a11bcd23dd7f..6e80a06da386d1 100644 --- a/Tools/cases_generator/generate_cases.py +++ b/Tools/cases_generator/generate_cases.py @@ -637,6 +637,7 @@ def write_instr(self, instr: Instruction) -> None: with self.out.block(f"TARGET({name})"): if instr.predicted: self.out.emit(f"PREDICTED({name});") + self.out.static_assert_family_size(instr.name, instr.family, instr.cache_offset) instr.write_case_body(self.out, tier=TIER_ONE) if not instr.always_exits: if instr.cache_offset: diff --git a/Tools/cases_generator/instructions.py b/Tools/cases_generator/instructions.py index 7094320a4a3862..27cfb2c20ea9e0 100644 --- a/Tools/cases_generator/instructions.py +++ b/Tools/cases_generator/instructions.py @@ -144,11 +144,6 @@ def is_viable_uop(self) -> bool: def write_case_body(self, out: Formatter, tier: Tiers) -> None: """Write one instruction, sans prologue and epilogue.""" - - # Write a static assertion that a family's cache size is correct - out.static_assert_family_size(self.name, self.family, self.cache_offset) - - # Write input stack effect variable declarations and initializations stacking.write_single_instr(self, out, tier) def write_body( From f40fb1f82267bcc39b0ded8b5e1f2f0b731a24f9 Mon Sep 17 00:00:00 2001 From: Guido van Rossum Date: Thu, 10 Aug 2023 16:55:09 -0700 Subject: [PATCH 14/34] Factor away write_case_body (formerly Instruction.write) --- Tools/cases_generator/generate_cases.py | 4 ++-- Tools/cases_generator/instructions.py | 4 ---- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/Tools/cases_generator/generate_cases.py b/Tools/cases_generator/generate_cases.py index 6e80a06da386d1..32344ffa8c011e 100644 --- a/Tools/cases_generator/generate_cases.py +++ b/Tools/cases_generator/generate_cases.py @@ -604,7 +604,7 @@ def write_executor_instructions( n_instrs += 1 self.out.emit("") with self.out.block(f"case {thing.name}:"): - instr.write_case_body(self.out, tier=TIER_TWO) + stacking.write_single_instr(instr, self.out, tier=TIER_TWO) if instr.check_eval_breaker: self.out.emit("CHECK_EVAL_BREAKER();") self.out.emit("break;") @@ -638,7 +638,7 @@ def write_instr(self, instr: Instruction) -> None: if instr.predicted: self.out.emit(f"PREDICTED({name});") self.out.static_assert_family_size(instr.name, instr.family, instr.cache_offset) - instr.write_case_body(self.out, tier=TIER_ONE) + stacking.write_single_instr(instr, self.out, tier=TIER_ONE) if not instr.always_exits: if instr.cache_offset: self.out.emit(f"next_instr += {instr.cache_offset};") diff --git a/Tools/cases_generator/instructions.py b/Tools/cases_generator/instructions.py index 27cfb2c20ea9e0..1fb291eeca1d0c 100644 --- a/Tools/cases_generator/instructions.py +++ b/Tools/cases_generator/instructions.py @@ -142,10 +142,6 @@ def is_viable_uop(self) -> bool: res = False return res - def write_case_body(self, out: Formatter, tier: Tiers) -> None: - """Write one instruction, sans prologue and epilogue.""" - stacking.write_single_instr(self, out, tier) - def write_body( self, out: Formatter, From 4f6f8f8f44761ae3945cc873497bbfe2ec07102b Mon Sep 17 00:00:00 2001 From: Guido van Rossum Date: Fri, 11 Aug 2023 14:58:02 -0700 Subject: [PATCH 15/34] Fold long lines --- Tools/cases_generator/generate_cases.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/Tools/cases_generator/generate_cases.py b/Tools/cases_generator/generate_cases.py index 32344ffa8c011e..2a695c45684cc1 100644 --- a/Tools/cases_generator/generate_cases.py +++ b/Tools/cases_generator/generate_cases.py @@ -604,7 +604,9 @@ def write_executor_instructions( n_instrs += 1 self.out.emit("") with self.out.block(f"case {thing.name}:"): - stacking.write_single_instr(instr, self.out, tier=TIER_TWO) + stacking.write_single_instr( + instr, self.out, tier=TIER_TWO + ) if instr.check_eval_breaker: self.out.emit("CHECK_EVAL_BREAKER();") self.out.emit("break;") @@ -637,7 +639,9 @@ def write_instr(self, instr: Instruction) -> None: with self.out.block(f"TARGET({name})"): if instr.predicted: self.out.emit(f"PREDICTED({name});") - self.out.static_assert_family_size(instr.name, instr.family, instr.cache_offset) + self.out.static_assert_family_size( + instr.name, instr.family, instr.cache_offset + ) stacking.write_single_instr(instr, self.out, tier=TIER_ONE) if not instr.always_exits: if instr.cache_offset: From 6facc8dd838998999c117ddf8c225ad09bebfe47 Mon Sep 17 00:00:00 2001 From: Guido van Rossum Date: Fri, 11 Aug 2023 14:19:21 -0700 Subject: [PATCH 16/34] Make less of a special case of _PUSH_FRAME Instead, the special case is an opcode using SAVE_FRAME_STATE(). Introducing #if TIER_ONE and #if TIER_TWO so we can implement _PUSH_FRAME differently for both tiers. --- Python/bytecodes.c | 16 +++++++++++++++- Python/ceval.c | 1 + Python/ceval_macros.h | 9 +++++++-- Python/executor.c | 18 +++++++----------- Python/executor_cases.c.h | 16 +++++++++++++++- Python/generated_cases.c.h | 16 +++++++++++++++- Tools/cases_generator/instructions.py | 8 ++++---- Tools/cases_generator/stacking.py | 12 ++++-------- 8 files changed, 68 insertions(+), 28 deletions(-) diff --git a/Python/bytecodes.c b/Python/bytecodes.c index e749348bdae488..9919c4141c9099 100644 --- a/Python/bytecodes.c +++ b/Python/bytecodes.c @@ -2981,8 +2981,22 @@ dummy_func( // (which will be pushed when the frame returns). // It is needed so CALL_PY_EXACT_ARGS matches its family. op(_PUSH_FRAME, (new_frame: _PyInterpreterFrame* -- unused)) { + // Write it out explicitly because it's subtly different. + // Eventually this should be the only occurrence of this code. frame->return_offset = 0; - DISPATCH_INLINED(new_frame); + assert(tstate->interp->eval_frame == NULL); + SAVE_FRAME_STATE(); // Signals to the code generator + new_frame->previous = frame; + CALL_STAT_INC(inlined_py_calls); + #if TIER_ONE + frame = cframe.current_frame = new_frame; + goto start_frame; + #endif + #if TIER_TWO + frame = tstate->cframe->current_frame = new_frame; + stack_pointer = _PyFrame_GetStackPointer(frame); + ip_offset = (_Py_CODEUNIT *)_PyFrame_GetCode(frame)->co_code_adaptive; + #endif } macro(CALL_PY_EXACT_ARGS) = diff --git a/Python/ceval.c b/Python/ceval.c index b966399a342d08..2370636d765d9c 100644 --- a/Python/ceval.c +++ b/Python/ceval.c @@ -770,6 +770,7 @@ _PyEval_EvalFrameDefault(PyThreadState *tstate, _PyInterpreterFrame *frame, int #endif { +#define TIER_ONE 1 #include "generated_cases.c.h" /* INSTRUMENTED_LINE has to be here, rather than in bytecodes.c, diff --git a/Python/ceval_macros.h b/Python/ceval_macros.h index 8dc8b754485856..77f760f0bb5995 100644 --- a/Python/ceval_macros.h +++ b/Python/ceval_macros.h @@ -103,11 +103,16 @@ DISPATCH_GOTO(); \ } +#define SAVE_FRAME_STATE() \ + do { \ + frame->prev_instr = next_instr - 1; \ + _PyFrame_SetStackPointer(frame, stack_pointer); \ + } while (0) + #define DISPATCH_INLINED(NEW_FRAME) \ do { \ assert(tstate->interp->eval_frame == NULL); \ - _PyFrame_SetStackPointer(frame, stack_pointer); \ - frame->prev_instr = next_instr - 1; \ + SAVE_FRAME_STATE(); \ (NEW_FRAME)->previous = frame; \ frame = cframe.current_frame = (NEW_FRAME); \ CALL_STAT_INC(inlined_py_calls); \ diff --git a/Python/executor.c b/Python/executor.c index cd673a7beeef88..c18ba6d4ade3e5 100644 --- a/Python/executor.c +++ b/Python/executor.c @@ -30,17 +30,12 @@ #undef ENABLE_SPECIALIZATION #define ENABLE_SPECIALIZATION 0 -#undef DISPATCH_INLINED -#define DISPATCH_INLINED(NEW_FRAME) \ - do { \ - assert(tstate->interp->eval_frame == NULL); \ - _PyFrame_SetStackPointer(frame, stack_pointer); \ - frame->prev_instr -= 1; \ - (NEW_FRAME)->previous = frame; \ - frame = tstate->cframe->current_frame = (NEW_FRAME); \ - CALL_STAT_INC(inlined_py_calls); \ - stack_pointer = _PyFrame_GetStackPointer(frame); \ - ip_offset = (_Py_CODEUNIT *)_PyFrame_GetCode(frame)->co_code_adaptive; \ +#undef SAVE_FRAME_STATE +#define SAVE_FRAME_STATE() \ + do { \ + /* Assume preceding SAVE_IP has set frame->prev_instr */ \ + frame->prev_instr--; \ + _PyFrame_SetStackPointer(frame, stack_pointer); \ } while (0) @@ -94,6 +89,7 @@ _PyUopExecute(_PyExecutorObject *executor, _PyInterpreterFrame *frame, PyObject OBJECT_STAT_INC(optimization_uops_executed); switch (opcode) { +#define TIER_TWO 2 #include "executor_cases.c.h" default: diff --git a/Python/executor_cases.c.h b/Python/executor_cases.c.h index f63c5c60fdc085..4ace06857eb424 100644 --- a/Python/executor_cases.c.h +++ b/Python/executor_cases.c.h @@ -2203,8 +2203,22 @@ _PyInterpreterFrame *new_frame; new_frame = (_PyInterpreterFrame *)stack_pointer[-1]; STACK_SHRINK(1); + // Write it out explicitly because it's subtly different. + // Eventually this should be the only occurrence of this code. frame->return_offset = 0; - DISPATCH_INLINED(new_frame); + assert(tstate->interp->eval_frame == NULL); + SAVE_FRAME_STATE(); // Signals to the code generator + new_frame->previous = frame; + CALL_STAT_INC(inlined_py_calls); + #if TIER_ONE + frame = cframe.current_frame = new_frame; + goto start_frame; + #endif + #if TIER_TWO + frame = tstate->cframe->current_frame = new_frame; + stack_pointer = _PyFrame_GetStackPointer(frame); + ip_offset = (_Py_CODEUNIT *)_PyFrame_GetCode(frame)->co_code_adaptive; + #endif break; } diff --git a/Python/generated_cases.c.h b/Python/generated_cases.c.h index f13b9066f937b2..a42457d05faf03 100644 --- a/Python/generated_cases.c.h +++ b/Python/generated_cases.c.h @@ -3807,8 +3807,22 @@ STACK_SHRINK(2); next_instr += 3; { + // Write it out explicitly because it's subtly different. + // Eventually this should be the only occurrence of this code. frame->return_offset = 0; - DISPATCH_INLINED(new_frame); + assert(tstate->interp->eval_frame == NULL); + SAVE_FRAME_STATE(); // Signals to the code generator + new_frame->previous = frame; + CALL_STAT_INC(inlined_py_calls); + #if TIER_ONE + frame = cframe.current_frame = new_frame; + goto start_frame; + #endif + #if TIER_TWO + frame = tstate->cframe->current_frame = new_frame; + stack_pointer = _PyFrame_GetStackPointer(frame); + ip_offset = (_Py_CODEUNIT *)_PyFrame_GetCode(frame)->co_code_adaptive; + #endif } } diff --git a/Tools/cases_generator/instructions.py b/Tools/cases_generator/instructions.py index 1fb291eeca1d0c..6c52cf3e8a3d09 100644 --- a/Tools/cases_generator/instructions.py +++ b/Tools/cases_generator/instructions.py @@ -60,6 +60,7 @@ class Instruction: # Computed by constructor always_exits: str # If the block always exits, its last line; else "" + save_frame_state: bool # Whether the instruction uses SAVE_FRAME_STATE() has_deopt: bool cache_offset: int cache_effects: list[parsing.CacheEffect] @@ -83,6 +84,7 @@ def __init__(self, inst: parsing.InstDef): self.block ) self.always_exits = always_exits(self.block_text) + self.save_frame_state = variable_used(self.inst, "SAVE_FRAME_STATE") self.has_deopt = variable_used(self.inst, "DEOPT_IF") self.cache_effects = [ effect for effect in inst.inputs if isinstance(effect, parsing.CacheEffect) @@ -120,15 +122,13 @@ def __init__(self, inst: parsing.InstDef): def is_viable_uop(self) -> bool: """Whether this instruction is viable as a uop.""" dprint: typing.Callable[..., None] = lambda *args, **kwargs: None - if "PY_EXACT" in self.name: + if "PUSH_FRAME" in self.name: dprint = print if self.name == "EXIT_TRACE": return True # This has 'return frame' but it's okay - if self.name == "_PUSH_FRAME": - return True # Has DISPATCH_INLINED but it's okay if self.always_exits: - dprint(f"Skipping {self.name} because it always exits") + dprint(f"Skipping {self.name} because it always exits: {self.always_exits}") return False if len(self.active_caches) > 1: # print(f"Skipping {self.name} because it has >1 cache entries") diff --git a/Tools/cases_generator/stacking.py b/Tools/cases_generator/stacking.py index c4f5ee29ef41af..a721cba0247a8a 100644 --- a/Tools/cases_generator/stacking.py +++ b/Tools/cases_generator/stacking.py @@ -1,6 +1,7 @@ import dataclasses import typing +from flags import variable_used_unspecialized from formatting import ( Formatter, UNUSED, @@ -314,7 +315,7 @@ def write_macro_instr( write_components(parts, out, TIER_ONE, mac.cache_offset) except AssertionError as err: raise AssertionError(f"Error writing macro {mac.name}") from err - if not parts[-1].instr.always_exits: + if not parts[-1].instr.always_exits and not parts[-1].instr.save_frame_state: if mac.cache_offset: out.emit(f"next_instr += {mac.cache_offset};") out.emit("DISPATCH();") @@ -366,12 +367,7 @@ def write_components( poke.as_stack_effect(lax=True), ) - dispatch_inlined_special_case = ( - mgr is managers[-1] - and mgr.instr.always_exits.startswith("DISPATCH_INLINED") - and mgr.instr.name == "_PUSH_FRAME" - ) - if dispatch_inlined_special_case: + if mgr.instr.save_frame_state: # Adjust stack to min_offset (input effects materialized) out.stack_adjust(mgr.min_offset.deep, mgr.min_offset.high) # Use clone() since adjust_inverse() mutates final_offset @@ -385,7 +381,7 @@ def write_components( with out.block(""): mgr.instr.write_body(out, -4, mgr.active_caches, tier) - if mgr is managers[-1] and not dispatch_inlined_special_case: + if mgr is managers[-1] and not mgr.instr.save_frame_state: # TODO: Explain why this adjustment is needed. out.stack_adjust(mgr.final_offset.deep, mgr.final_offset.high) # Use clone() since adjust_inverse() mutates final_offset From 94630d49b3283657953d44b7fbe2a8a36d624721 Mon Sep 17 00:00:00 2001 From: Guido van Rossum Date: Fri, 11 Aug 2023 14:45:31 -0700 Subject: [PATCH 17/34] Stop special-casing _PUSH_FRAME altogether Instead, we special-case SAVE_IP: - Its Tier 2 expansion sets oparg to the instruction offset - In Tier 1 it is a no-op (and skipped if present in a macro) --- Python/bytecodes.c | 1 + Tools/cases_generator/generate_cases.py | 12 ++++-------- Tools/cases_generator/stacking.py | 16 ++++++++++++++-- 3 files changed, 19 insertions(+), 10 deletions(-) diff --git a/Python/bytecodes.c b/Python/bytecodes.c index 9919c4141c9099..cbbd6d48daae75 100644 --- a/Python/bytecodes.c +++ b/Python/bytecodes.c @@ -3005,6 +3005,7 @@ dummy_func( _CHECK_FUNCTION_EXACT_ARGS + _CHECK_STACK_SPACE + _INIT_CALL_PY_EXACT_ARGS + + SAVE_IP + // Tier 2 only; special-cased oparg _PUSH_FRAME; inst(CALL_PY_WITH_DEFAULTS, (unused/1, func_version/2, callable, self_or_null, args[oparg] -- unused)) { diff --git a/Tools/cases_generator/generate_cases.py b/Tools/cases_generator/generate_cases.py index 2a695c45684cc1..6050fcf84479e9 100644 --- a/Tools/cases_generator/generate_cases.py +++ b/Tools/cases_generator/generate_cases.py @@ -443,13 +443,6 @@ def write_macro_expansions( expansions: list[tuple[str, int, int]] = [] # [(name, size, offset), ...] for part in parts: if isinstance(part, Component): - # _PUSH_FRAME is super special; it expands to SAVE_IP(next_instr) + _PUSH_FRAME - if part.instr.name == "_PUSH_FRAME": - expansions.append( - ("SAVE_IP", OPARG_SIZES["OPARG_SAVE_IP"], cache_offset) - ) - expansions.append(("_PUSH_FRAME", OPARG_SIZES["OPARG_FULL"], 0)) - continue # All component instructions must be viable uops if not part.instr.is_viable_uop(): # This note just reminds us about macros that cannot @@ -463,7 +456,10 @@ def write_macro_expansions( ) return if not part.active_caches: - size, offset = OPARG_SIZES["OPARG_FULL"], 0 + if part.instr.name == "SAVE_IP": + size, offset = OPARG_SIZES["OPARG_SAVE_IP"], cache_offset + else: + size, offset = OPARG_SIZES["OPARG_FULL"], 0 else: # If this assert triggers, is_viable_uops() lied assert len(part.active_caches) == 1, (name, part.instr.name) diff --git a/Tools/cases_generator/stacking.py b/Tools/cases_generator/stacking.py index a721cba0247a8a..7154d8fbab454f 100644 --- a/Tools/cases_generator/stacking.py +++ b/Tools/cases_generator/stacking.py @@ -147,6 +147,8 @@ class EffectManager: # Track offsets from stack pointer min_offset: StackOffset final_offset: StackOffset + # Link to previous manager + pred: "EffectManager | None" = None def __init__( self, @@ -168,7 +170,8 @@ def __init__( self.pokes.append(StackItem(offset=self.final_offset.clone(), effect=eff)) self.final_offset.higher(eff) - if pred: + self.pred = pred + while pred: # Replace push(x) + pop(y) with copy(x, y). # Check that the sources and destinations are disjoint. sources: set[str] = set() @@ -193,6 +196,11 @@ def __init__( sources, destinations, ) + # See if we can get more copies of a earlier predecessor. + if self.peeks and not pred.pokes and not pred.peeks: + pred = pred.pred + else: + pred = None # Break def adjust_deeper(self, eff: StackEffect) -> None: for peek in self.peeks: @@ -305,7 +313,11 @@ def write_single_instr( def write_macro_instr( mac: MacroInstruction, out: Formatter, family: Family | None ) -> None: - parts = [part for part in mac.parts if isinstance(part, Component)] + parts = [ + part + for part in mac.parts + if isinstance(part, Component) and part.instr.name != "SAVE_IP" + ] out.emit("") with out.block(f"TARGET({mac.name})"): if mac.predicted: From cf8e2c02d191d3828366b24c81ee4b4c213d2ef0 Mon Sep 17 00:00:00 2001 From: Guido van Rossum Date: Tue, 15 Aug 2023 13:20:38 -0700 Subject: [PATCH 18/34] Call _Py_EnterRecursivePy in _FRAME_PUSH --- Python/bytecodes.c | 1 + Python/ceval.c | 5 ----- Python/ceval_macros.h | 5 +++++ Python/executor.c | 1 + Python/executor_cases.c.h | 1 + Python/generated_cases.c.h | 1 + 6 files changed, 9 insertions(+), 5 deletions(-) diff --git a/Python/bytecodes.c b/Python/bytecodes.c index cbbd6d48daae75..0ad08aafd0ee2a 100644 --- a/Python/bytecodes.c +++ b/Python/bytecodes.c @@ -2994,6 +2994,7 @@ dummy_func( #endif #if TIER_TWO frame = tstate->cframe->current_frame = new_frame; + ERROR_IF(_Py_EnterRecursivePy(tstate), xz); stack_pointer = _PyFrame_GetStackPointer(frame); ip_offset = (_Py_CODEUNIT *)_PyFrame_GetCode(frame)->co_code_adaptive; #endif diff --git a/Python/ceval.c b/Python/ceval.c index 2370636d765d9c..26e741ed7c7547 100644 --- a/Python/ceval.c +++ b/Python/ceval.c @@ -602,11 +602,6 @@ int _Py_CheckRecursiveCallPy( return 0; } -static inline int _Py_EnterRecursivePy(PyThreadState *tstate) { - return (tstate->py_recursion_remaining-- <= 0) && - _Py_CheckRecursiveCallPy(tstate); -} - static inline void _Py_LeaveRecursiveCallPy(PyThreadState *tstate) { tstate->py_recursion_remaining++; diff --git a/Python/ceval_macros.h b/Python/ceval_macros.h index 77f760f0bb5995..67afd5c12d5ce3 100644 --- a/Python/ceval_macros.h +++ b/Python/ceval_macros.h @@ -369,3 +369,8 @@ static const convertion_func_ptr CONVERSION_FUNCTIONS[4] = { #else #define _Py_atomic_load_relaxed_int32(ATOMIC_VAL) _Py_atomic_load_relaxed(ATOMIC_VAL) #endif + +static inline int _Py_EnterRecursivePy(PyThreadState *tstate) { + return (tstate->py_recursion_remaining-- <= 0) && + _Py_CheckRecursiveCallPy(tstate); +} diff --git a/Python/executor.c b/Python/executor.c index c18ba6d4ade3e5..1aa0578b5f899c 100644 --- a/Python/executor.c +++ b/Python/executor.c @@ -115,6 +115,7 @@ _PyUopExecute(_PyExecutorObject *executor, _PyInterpreterFrame *frame, PyObject pop_2_error: STACK_SHRINK(1); pop_1_error: +pop_1_exit_unwind: STACK_SHRINK(1); error: // On ERROR_IF we return NULL as the frame. diff --git a/Python/executor_cases.c.h b/Python/executor_cases.c.h index 4ace06857eb424..15c7e5b6fd0589 100644 --- a/Python/executor_cases.c.h +++ b/Python/executor_cases.c.h @@ -2216,6 +2216,7 @@ #endif #if TIER_TWO frame = tstate->cframe->current_frame = new_frame; + if (_Py_EnterRecursivePy(tstate)) goto pop_1_exit_unwind; stack_pointer = _PyFrame_GetStackPointer(frame); ip_offset = (_Py_CODEUNIT *)_PyFrame_GetCode(frame)->co_code_adaptive; #endif diff --git a/Python/generated_cases.c.h b/Python/generated_cases.c.h index a42457d05faf03..910a10048d9919 100644 --- a/Python/generated_cases.c.h +++ b/Python/generated_cases.c.h @@ -3820,6 +3820,7 @@ #endif #if TIER_TWO frame = tstate->cframe->current_frame = new_frame; + if (_Py_EnterRecursivePy(tstate)) goto pop_1_exit_unwind; stack_pointer = _PyFrame_GetStackPointer(frame); ip_offset = (_Py_CODEUNIT *)_PyFrame_GetCode(frame)->co_code_adaptive; #endif From 337a2ebc12da34301b4ce77bfcaed41dcffbfe7e Mon Sep 17 00:00:00 2001 From: Guido van Rossum Date: Tue, 8 Aug 2023 12:09:32 -0700 Subject: [PATCH 19/34] Add function-by-version cache --- Include/internal/pycore_function.h | 9 +++++++ Objects/funcobject.c | 41 ++++++++++++++++++++++++++++-- Python/bytecodes.c | 3 ++- Python/executor_cases.c.h | 3 ++- Python/generated_cases.c.h | 3 ++- Python/sysmodule.c | 2 ++ 6 files changed, 56 insertions(+), 5 deletions(-) diff --git a/Include/internal/pycore_function.h b/Include/internal/pycore_function.h index e844d323ec7927..d059c3731b435f 100644 --- a/Include/internal/pycore_function.h +++ b/Include/internal/pycore_function.h @@ -16,13 +16,22 @@ extern PyObject* _PyFunction_Vectorcall( #define FUNC_MAX_WATCHERS 8 +#define FUNC_VERSION_CACHE_SIZE (1<<12) /* Must be a power of 2 */ struct _py_func_state { uint32_t next_version; + // Function objects whose func_version % FUNC_VERSION_CACHE_SIZE + // once equaled the index in the table. The references are owned: + // Call _PyFunction_ClearByVersionCache() to clear. + PyFunctionObject *func_version_cache[FUNC_VERSION_CACHE_SIZE]; }; extern PyFunctionObject* _PyFunction_FromConstructor(PyFrameConstructor *constr); extern uint32_t _PyFunction_GetVersionForCurrentState(PyFunctionObject *func); +extern void _PyFunction_SetVersion(PyFunctionObject *func, uint32_t version); +PyFunctionObject *_PyFunction_LookupByVersion(uint32_t version); +void _PyFunction_ClearByVersionCache(void); + extern PyObject *_Py_set_function_type_params( PyThreadState* unused, PyObject *func, PyObject *type_params); diff --git a/Objects/funcobject.c b/Objects/funcobject.c index 8c0bface3ac710..10655fd3def4f3 100644 --- a/Objects/funcobject.c +++ b/Objects/funcobject.c @@ -223,7 +223,44 @@ PyFunction_NewWithQualName(PyObject *code, PyObject *globals, PyObject *qualname return NULL; } -uint32_t _PyFunction_GetVersionForCurrentState(PyFunctionObject *func) +void +_PyFunction_SetVersion(PyFunctionObject *func, uint32_t version) +{ + func->func_version = version; + if (version != 0) { + PyInterpreterState *interp = _PyInterpreterState_GET(); + PyFunctionObject **slot = + interp->func_state.func_version_cache + + (version % FUNC_VERSION_CACHE_SIZE); + Py_XSETREF(*slot, (PyFunctionObject *)Py_NewRef(func)); + } +} + +PyFunctionObject * +_PyFunction_LookupByVersion(uint32_t version) +{ + PyInterpreterState *interp = _PyInterpreterState_GET(); + PyFunctionObject **slot = + interp->func_state.func_version_cache + + (version % FUNC_VERSION_CACHE_SIZE); + if (*slot != NULL && (*slot)->func_version == version) { + return (PyFunctionObject *)Py_NewRef(*slot); + } + return NULL; +} + +void +_PyFunction_ClearByVersionCache(void) +{ + PyInterpreterState *interp = _PyInterpreterState_GET(); + for (int i = 0; i < FUNC_VERSION_CACHE_SIZE; i++) { + PyFunctionObject **slot = interp->func_state.func_version_cache + i; + Py_CLEAR(*slot); + } +} + +uint32_t +_PyFunction_GetVersionForCurrentState(PyFunctionObject *func) { if (func->func_version != 0) { return func->func_version; @@ -236,7 +273,7 @@ uint32_t _PyFunction_GetVersionForCurrentState(PyFunctionObject *func) return 0; } uint32_t v = interp->func_state.next_version++; - func->func_version = v; + _PyFunction_SetVersion(func, v); return v; } diff --git a/Python/bytecodes.c b/Python/bytecodes.c index dc6ac36f4dfb3f..09ca586c8ff401 100644 --- a/Python/bytecodes.c +++ b/Python/bytecodes.c @@ -3549,7 +3549,8 @@ dummy_func( goto error; } - func_obj->func_version = ((PyCodeObject *)codeobj)->co_version; + _PyFunction_SetVersion( + func_obj, ((PyCodeObject *)codeobj)->co_version); func = (PyObject *)func_obj; } diff --git a/Python/executor_cases.c.h b/Python/executor_cases.c.h index 7e58c3f7b4d236..4147c7b2daa8a4 100644 --- a/Python/executor_cases.c.h +++ b/Python/executor_cases.c.h @@ -2608,7 +2608,8 @@ goto error; } - func_obj->func_version = ((PyCodeObject *)codeobj)->co_version; + _PyFunction_SetVersion( + func_obj, ((PyCodeObject *)codeobj)->co_version); func = (PyObject *)func_obj; stack_pointer[-1] = func; break; diff --git a/Python/generated_cases.c.h b/Python/generated_cases.c.h index 4628bbf6dc3efe..b0e418afacee6d 100644 --- a/Python/generated_cases.c.h +++ b/Python/generated_cases.c.h @@ -4568,7 +4568,8 @@ goto error; } - func_obj->func_version = ((PyCodeObject *)codeobj)->co_version; + _PyFunction_SetVersion( + func_obj, ((PyCodeObject *)codeobj)->co_version); func = (PyObject *)func_obj; stack_pointer[-1] = func; DISPATCH(); diff --git a/Python/sysmodule.c b/Python/sysmodule.c index be026d95ba7e77..f6511fd7b0c9dc 100644 --- a/Python/sysmodule.c +++ b/Python/sysmodule.c @@ -2079,6 +2079,8 @@ sys__clear_type_cache_impl(PyObject *module) /*[clinic end generated code: output=20e48ca54a6f6971 input=127f3e04a8d9b555]*/ { PyType_ClearCache(); + // Also clear the function-by-version cache + _PyFunction_ClearByVersionCache(); Py_RETURN_NONE; } From 0218222af8b8c964c7bbef8fe54d475aa75f2aff Mon Sep 17 00:00:00 2001 From: Guido van Rossum Date: Tue, 8 Aug 2023 14:54:46 -0700 Subject: [PATCH 20/34] Trace into function calls - This uses the function-by-version cache I just added - There's not yet a way to trace back via `RETURN_VALUE` --- Python/optimizer.c | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/Python/optimizer.c b/Python/optimizer.c index 559c4ae987263e..dd30d53063b4c6 100644 --- a/Python/optimizer.c +++ b/Python/optimizer.c @@ -448,6 +448,7 @@ translate_bytecode_to_trace( code->co_firstlineno, 2 * INSTR_IP(initial_instr, code)); +top: // Jump here after _PUSH_FRAME for (;;) { RESERVE_RAW(2, "epilogue"); // Always need space for SAVE_IP and EXIT_TRACE ADD_TO_TRACE(SAVE_IP, INSTR_IP(instr, code), 0); @@ -621,6 +622,34 @@ translate_bytecode_to_trace( ADD_TO_TRACE(expansion->uops[i].uop, oparg, operand); if (expansion->uops[i].uop == _PUSH_FRAME) { assert(i + 1 == nuops); + int func_version_offset = + offsetof(_PyCallCache, func_version)/sizeof(_Py_CODEUNIT) + // Add one to account for the actual opcode/oparg pair: + + 1; + uint32_t func_version = read_u32(&instr[func_version_offset].cache); + PyFunctionObject *func = _PyFunction_LookupByVersion(func_version); + DPRINTF(3, "Function object: %p\n", func); + if (func != NULL) { + PyCodeObject *new_code = (PyCodeObject *)PyFunction_GET_CODE(func); + if (new_code == code) { + // Recursive call, bail (we could be here forever). + DPRINTF(2, "Bailing on recursive call to %s (%s:%d)\n", + PyUnicode_AsUTF8(new_code->co_qualname), + PyUnicode_AsUTF8(new_code->co_filename), + new_code->co_firstlineno); + ADD_TO_TRACE(SAVE_IP, 0, 0); + goto done; + } + code = new_code; + initial_instr = instr = _PyCode_CODE(code); + DPRINTF(2, + "Continuing in %s (%s:%d) at byte offset %d\n", + PyUnicode_AsUTF8(code->co_qualname), + PyUnicode_AsUTF8(code->co_filename), + code->co_firstlineno, + 2 * INSTR_IP(initial_instr, code)); + goto top; + } ADD_TO_TRACE(SAVE_IP, 0, 0); goto done; } From ab74ef0df79b8b3aa9bbcf360b538d4b5f07a603 Mon Sep 17 00:00:00 2001 From: Guido van Rossum Date: Fri, 11 Aug 2023 18:40:41 -0700 Subject: [PATCH 21/34] Make RESUME a viable uop --- Include/internal/pycore_opcode_metadata.h | 1 + Lib/test/test_capi/test_misc.py | 1 + Python/bytecodes.c | 5 ++++- Python/executor_cases.c.h | 18 ++++++++++++++++++ Python/generated_cases.c.h | 5 ++++- Tools/cases_generator/flags.py | 2 +- Tools/cases_generator/instructions.py | 4 ++-- 7 files changed, 31 insertions(+), 5 deletions(-) diff --git a/Include/internal/pycore_opcode_metadata.h b/Include/internal/pycore_opcode_metadata.h index 4b6f274973e848..dec41fccca5fbd 100644 --- a/Include/internal/pycore_opcode_metadata.h +++ b/Include/internal/pycore_opcode_metadata.h @@ -1395,6 +1395,7 @@ extern const struct opcode_macro_expansion _PyOpcode_macro_expansion[OPCODE_MACR #ifdef NEED_OPCODE_METADATA const struct opcode_macro_expansion _PyOpcode_macro_expansion[OPCODE_MACRO_EXPANSION_SIZE] = { [NOP] = { .nuops = 1, .uops = { { NOP, 0, 0 } } }, + [RESUME] = { .nuops = 1, .uops = { { RESUME, 0, 0 } } }, [LOAD_FAST_CHECK] = { .nuops = 1, .uops = { { LOAD_FAST_CHECK, 0, 0 } } }, [LOAD_FAST] = { .nuops = 1, .uops = { { LOAD_FAST, 0, 0 } } }, [LOAD_FAST_AND_CLEAR] = { .nuops = 1, .uops = { { LOAD_FAST_AND_CLEAR, 0, 0 } } }, diff --git a/Lib/test/test_capi/test_misc.py b/Lib/test/test_capi/test_misc.py index 3dfbfdc26e7416..18a0476122dabf 100644 --- a/Lib/test/test_capi/test_misc.py +++ b/Lib/test/test_capi/test_misc.py @@ -2633,6 +2633,7 @@ def dummy(x): self.assertIsNotNone(ex) uops = {opname for opname, _, _ in ex} self.assertIn("_PUSH_FRAME", uops) + self.assertIn("_BINARY_OP_ADD_INT", uops) diff --git a/Python/bytecodes.c b/Python/bytecodes.c index 09ca586c8ff401..11da07378cdc37 100644 --- a/Python/bytecodes.c +++ b/Python/bytecodes.c @@ -135,6 +135,7 @@ dummy_func( } inst(RESUME, (--)) { + #if TIER_ONE assert(tstate->cframe == &cframe); assert(frame == cframe.current_frame); /* Possibly combine this with eval breaker */ @@ -143,7 +144,9 @@ dummy_func( ERROR_IF(err, error); next_instr--; } - else if (oparg < 2) { + else + #endif + if (oparg < 2) { CHECK_EVAL_BREAKER(); } } diff --git a/Python/executor_cases.c.h b/Python/executor_cases.c.h index 4147c7b2daa8a4..94138e9b459989 100644 --- a/Python/executor_cases.c.h +++ b/Python/executor_cases.c.h @@ -7,6 +7,24 @@ break; } + case RESUME: { + #if TIER_ONE + assert(tstate->cframe == &cframe); + assert(frame == cframe.current_frame); + /* Possibly combine this with eval breaker */ + if (_PyFrame_GetCode(frame)->_co_instrumentation_version != tstate->interp->monitoring_version) { + int err = _Py_Instrument(_PyFrame_GetCode(frame), tstate->interp); + if (err) goto error; + next_instr--; + } + else + #endif + if (oparg < 2) { + CHECK_EVAL_BREAKER(); + } + break; + } + case LOAD_FAST_CHECK: { PyObject *value; value = GETLOCAL(oparg); diff --git a/Python/generated_cases.c.h b/Python/generated_cases.c.h index b0e418afacee6d..63a45212a17a0a 100644 --- a/Python/generated_cases.c.h +++ b/Python/generated_cases.c.h @@ -8,6 +8,7 @@ } TARGET(RESUME) { + #if TIER_ONE assert(tstate->cframe == &cframe); assert(frame == cframe.current_frame); /* Possibly combine this with eval breaker */ @@ -16,7 +17,9 @@ if (err) goto error; next_instr--; } - else if (oparg < 2) { + else + #endif + if (oparg < 2) { CHECK_EVAL_BREAKER(); } DISPATCH(); diff --git a/Tools/cases_generator/flags.py b/Tools/cases_generator/flags.py index f7ebdeb0d65677..962f003b194dbd 100644 --- a/Tools/cases_generator/flags.py +++ b/Tools/cases_generator/flags.py @@ -92,7 +92,7 @@ def variable_used_unspecialized(node: parsing.Node, name: str) -> bool: if text == "#if": if ( i + 1 < len(node.tokens) - and node.tokens[i + 1].text == "ENABLE_SPECIALIZATION" + and node.tokens[i + 1].text in ("ENABLE_SPECIALIZATION", "TIER_ONE") ): skipping = True elif text in ("#else", "#endif"): diff --git a/Tools/cases_generator/instructions.py b/Tools/cases_generator/instructions.py index 7dabff942f4769..2314fed1769a60 100644 --- a/Tools/cases_generator/instructions.py +++ b/Tools/cases_generator/instructions.py @@ -122,8 +122,8 @@ def __init__(self, inst: parsing.InstDef): def is_viable_uop(self) -> bool: """Whether this instruction is viable as a uop.""" dprint: typing.Callable[..., None] = lambda *args, **kwargs: None - if "PUSH_FRAME" in self.name: - dprint = print + # if "RESUME" in self.name: + # dprint = print if self.name == "EXIT_TRACE": return True # This has 'return frame' but it's okay From 7e6ef784ab805e08e00560f17a6fc71ed1bd093d Mon Sep 17 00:00:00 2001 From: Guido van Rossum Date: Sat, 12 Aug 2023 12:57:13 -0700 Subject: [PATCH 22/34] Clear func_version_cache in interpreter_clear() This should fix leaks and hopefully most failing tests. --- Include/internal/pycore_function.h | 2 +- Objects/funcobject.c | 3 +-- Python/pystate.c | 3 +++ Python/sysmodule.c | 3 ++- 4 files changed, 7 insertions(+), 4 deletions(-) diff --git a/Include/internal/pycore_function.h b/Include/internal/pycore_function.h index d059c3731b435f..cb3f856410d0d1 100644 --- a/Include/internal/pycore_function.h +++ b/Include/internal/pycore_function.h @@ -30,7 +30,7 @@ extern PyFunctionObject* _PyFunction_FromConstructor(PyFrameConstructor *constr) extern uint32_t _PyFunction_GetVersionForCurrentState(PyFunctionObject *func); extern void _PyFunction_SetVersion(PyFunctionObject *func, uint32_t version); PyFunctionObject *_PyFunction_LookupByVersion(uint32_t version); -void _PyFunction_ClearByVersionCache(void); +void _PyFunction_ClearByVersionCache(PyInterpreterState *interp); extern PyObject *_Py_set_function_type_params( PyThreadState* unused, PyObject *func, PyObject *type_params); diff --git a/Objects/funcobject.c b/Objects/funcobject.c index 10655fd3def4f3..82c358301aa8bf 100644 --- a/Objects/funcobject.c +++ b/Objects/funcobject.c @@ -250,9 +250,8 @@ _PyFunction_LookupByVersion(uint32_t version) } void -_PyFunction_ClearByVersionCache(void) +_PyFunction_ClearByVersionCache(PyInterpreterState *interp) { - PyInterpreterState *interp = _PyInterpreterState_GET(); for (int i = 0; i < FUNC_VERSION_CACHE_SIZE; i++) { PyFunctionObject **slot = interp->func_state.func_version_cache + i; Py_CLEAR(*slot); diff --git a/Python/pystate.c b/Python/pystate.c index 3a05cb0fa7988d..321abe7a6c633b 100644 --- a/Python/pystate.c +++ b/Python/pystate.c @@ -840,6 +840,9 @@ interpreter_clear(PyInterpreterState *interp, PyThreadState *tstate) */ // XXX Make sure we properly deal with problematic finalizers. + interp->func_state.next_version = 0; // No more new versions + _PyFunction_ClearByVersionCache(interp); + Py_CLEAR(interp->audit_hooks); for (int i = 0; i < _PY_MONITORING_UNGROUPED_EVENTS; i++) { diff --git a/Python/sysmodule.c b/Python/sysmodule.c index f6511fd7b0c9dc..f8cb8e6d61a048 100644 --- a/Python/sysmodule.c +++ b/Python/sysmodule.c @@ -2080,7 +2080,8 @@ sys__clear_type_cache_impl(PyObject *module) { PyType_ClearCache(); // Also clear the function-by-version cache - _PyFunction_ClearByVersionCache(); + PyInterpreterState *interp = _PyInterpreterState_GET(); + _PyFunction_ClearByVersionCache(interp); Py_RETURN_NONE; } From 120001477f54b5ed22cd362de09e63bef0d56fc5 Mon Sep 17 00:00:00 2001 From: Guido van Rossum Date: Sat, 12 Aug 2023 14:12:04 -0700 Subject: [PATCH 23/34] Add a small essay on function and code versions --- Objects/funcobject.c | 42 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/Objects/funcobject.c b/Objects/funcobject.c index 82c358301aa8bf..6a1d38da169163 100644 --- a/Objects/funcobject.c +++ b/Objects/funcobject.c @@ -223,6 +223,48 @@ PyFunction_NewWithQualName(PyObject *code, PyObject *globals, PyObject *qualname return NULL; } +/* +Function versions +----------------- + +Function versions are used to detect when a function object has been +updated, invalidating inline cache data used by the `CALL` bytecode +(notably `CALL_PY_EXACT_ARGS` and a few other `CALL` specializations). + +They are also used by the Tier 2 superblock creation code to find +the function being called (and from there the code object). + +How does a function's `func_version` field get initialized? + +- `PyFunction_New` and friends initialize it to 0. +- The `MAKE_FUNCTION` instruction sets it from the code's `co_version`. +- It is reset to 0 when various attributes like `__code__` are set. +- A new version is allocated by `_PyFunction_GetVersionForCurrentState` + when the specializer needs a version and the version is 0. + +The latter allocates versions using a counter in the interpreter state; +when the counter wraps around to 0, no more versions are allocated. +There is one other special case: functions with a non-standard +`vectorcall` field are not given a version. + +When the function version is 0, the `CALL` bytecode is not specialized. + +Code object versions +-------------------- + +So where to code objects get their `co_version`? There is a single +static global counter, `_Py_next_func_version`. This is initialized in +the generated (!) file `Python/deepfreeze/deepfreeze.c`, to 1 plus the +number of deep-frozen function objects in that file. +(In `_bootstrap_python.c` and `freeze_module.c` it is initialized to 1.) + +Code objects get a new `co_version` allocated from this counter upon +creation. Since code objects are nominally immutable, `co_version` can +not be invalidated. The only way it can be 0 is when 2**32 or more +code objects have been created during the process's lifetime. +(The counter isn't reset by `fork()`, extending the lifetime.) +*/ + void _PyFunction_SetVersion(PyFunctionObject *func, uint32_t version) { From ece96f4a22e19c9186c0247df332c136c1836587 Mon Sep 17 00:00:00 2001 From: Guido van Rossum Date: Sat, 12 Aug 2023 14:49:36 -0700 Subject: [PATCH 24/34] Move function cache clearing earlier in finalization --- Python/pylifecycle.c | 5 +++++ Python/pystate.c | 3 --- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/Python/pylifecycle.c b/Python/pylifecycle.c index 0de3abf9407899..b3319e81c89e96 100644 --- a/Python/pylifecycle.c +++ b/Python/pylifecycle.c @@ -1767,6 +1767,11 @@ Py_FinalizeEx(void) // XXX assert(_Py_IsMainInterpreter(tstate->interp)); // XXX assert(_Py_IsMainThread()); + // The function version cache keeps functions alive, so clear it. + // It's used for optimizations, which we don't need in this stage. + tstate->interp->func_state.next_version = 0; // No more new versions + _PyFunction_ClearByVersionCache(tstate->interp); + // Block some operations. tstate->interp->finalizing = 1; diff --git a/Python/pystate.c b/Python/pystate.c index 321abe7a6c633b..3a05cb0fa7988d 100644 --- a/Python/pystate.c +++ b/Python/pystate.c @@ -840,9 +840,6 @@ interpreter_clear(PyInterpreterState *interp, PyThreadState *tstate) */ // XXX Make sure we properly deal with problematic finalizers. - interp->func_state.next_version = 0; // No more new versions - _PyFunction_ClearByVersionCache(interp); - Py_CLEAR(interp->audit_hooks); for (int i = 0; i < _PY_MONITORING_UNGROUPED_EVENTS; i++) { From b408b2ba15324d9f9bae25e87d3c20893a57e328 Mon Sep 17 00:00:00 2001 From: Guido van Rossum Date: Sat, 12 Aug 2023 18:56:14 -0700 Subject: [PATCH 25/34] Cache borrowed references, cleared in func_dealloc --- Include/internal/pycore_function.h | 8 ++++---- Objects/funcobject.c | 33 ++++++++++++++---------------- Python/abstract_interp_cases.c.h | 4 ++++ Python/pylifecycle.c | 5 ----- Python/sysmodule.c | 3 --- 5 files changed, 23 insertions(+), 30 deletions(-) diff --git a/Include/internal/pycore_function.h b/Include/internal/pycore_function.h index cb3f856410d0d1..3f3da8a44b77e4 100644 --- a/Include/internal/pycore_function.h +++ b/Include/internal/pycore_function.h @@ -19,9 +19,10 @@ extern PyObject* _PyFunction_Vectorcall( #define FUNC_VERSION_CACHE_SIZE (1<<12) /* Must be a power of 2 */ struct _py_func_state { uint32_t next_version; - // Function objects whose func_version % FUNC_VERSION_CACHE_SIZE - // once equaled the index in the table. The references are owned: - // Call _PyFunction_ClearByVersionCache() to clear. + // Borrowed references to function objects whose + // func_version % FUNC_VERSION_CACHE_SIZE + // once was equal to the index in the table. + // They are cleared when the function is deallocated. PyFunctionObject *func_version_cache[FUNC_VERSION_CACHE_SIZE]; }; @@ -30,7 +31,6 @@ extern PyFunctionObject* _PyFunction_FromConstructor(PyFrameConstructor *constr) extern uint32_t _PyFunction_GetVersionForCurrentState(PyFunctionObject *func); extern void _PyFunction_SetVersion(PyFunctionObject *func, uint32_t version); PyFunctionObject *_PyFunction_LookupByVersion(uint32_t version); -void _PyFunction_ClearByVersionCache(PyInterpreterState *interp); extern PyObject *_Py_set_function_type_params( PyThreadState* unused, PyObject *func, PyObject *type_params); diff --git a/Objects/funcobject.c b/Objects/funcobject.c index 6a1d38da169163..33191d23f18230 100644 --- a/Objects/funcobject.c +++ b/Objects/funcobject.c @@ -271,10 +271,8 @@ _PyFunction_SetVersion(PyFunctionObject *func, uint32_t version) func->func_version = version; if (version != 0) { PyInterpreterState *interp = _PyInterpreterState_GET(); - PyFunctionObject **slot = - interp->func_state.func_version_cache - + (version % FUNC_VERSION_CACHE_SIZE); - Py_XSETREF(*slot, (PyFunctionObject *)Py_NewRef(func)); + interp->func_state.func_version_cache[ + version % FUNC_VERSION_CACHE_SIZE] = func; } } @@ -282,24 +280,14 @@ PyFunctionObject * _PyFunction_LookupByVersion(uint32_t version) { PyInterpreterState *interp = _PyInterpreterState_GET(); - PyFunctionObject **slot = - interp->func_state.func_version_cache - + (version % FUNC_VERSION_CACHE_SIZE); - if (*slot != NULL && (*slot)->func_version == version) { - return (PyFunctionObject *)Py_NewRef(*slot); + PyFunctionObject *func = interp->func_state.func_version_cache[ + version % FUNC_VERSION_CACHE_SIZE]; + if (func != NULL && func->func_version == version) { + return (PyFunctionObject *)Py_NewRef(func); } return NULL; } -void -_PyFunction_ClearByVersionCache(PyInterpreterState *interp) -{ - for (int i = 0; i < FUNC_VERSION_CACHE_SIZE; i++) { - PyFunctionObject **slot = interp->func_state.func_version_cache + i; - Py_CLEAR(*slot); - } -} - uint32_t _PyFunction_GetVersionForCurrentState(PyFunctionObject *func) { @@ -929,6 +917,15 @@ func_dealloc(PyFunctionObject *op) if (op->func_weakreflist != NULL) { PyObject_ClearWeakRefs((PyObject *) op); } + if (op->func_version != 0) { + PyInterpreterState *interp = _PyInterpreterState_GET(); + PyFunctionObject **slot = + interp->func_state.func_version_cache + + (op->func_version % FUNC_VERSION_CACHE_SIZE); + if (*slot == op) { + *slot = NULL; + } + } (void)func_clear(op); // These aren't cleared by func_clear(). Py_DECREF(op->func_code); diff --git a/Python/abstract_interp_cases.c.h b/Python/abstract_interp_cases.c.h index ca3c274e87b691..accb9129b5301d 100644 --- a/Python/abstract_interp_cases.c.h +++ b/Python/abstract_interp_cases.c.h @@ -7,6 +7,10 @@ break; } + case RESUME: { + break; + } + case POP_TOP: { STACK_SHRINK(1); break; diff --git a/Python/pylifecycle.c b/Python/pylifecycle.c index b3319e81c89e96..0de3abf9407899 100644 --- a/Python/pylifecycle.c +++ b/Python/pylifecycle.c @@ -1767,11 +1767,6 @@ Py_FinalizeEx(void) // XXX assert(_Py_IsMainInterpreter(tstate->interp)); // XXX assert(_Py_IsMainThread()); - // The function version cache keeps functions alive, so clear it. - // It's used for optimizations, which we don't need in this stage. - tstate->interp->func_state.next_version = 0; // No more new versions - _PyFunction_ClearByVersionCache(tstate->interp); - // Block some operations. tstate->interp->finalizing = 1; diff --git a/Python/sysmodule.c b/Python/sysmodule.c index f8cb8e6d61a048..be026d95ba7e77 100644 --- a/Python/sysmodule.c +++ b/Python/sysmodule.c @@ -2079,9 +2079,6 @@ sys__clear_type_cache_impl(PyObject *module) /*[clinic end generated code: output=20e48ca54a6f6971 input=127f3e04a8d9b555]*/ { PyType_ClearCache(); - // Also clear the function-by-version cache - PyInterpreterState *interp = _PyInterpreterState_GET(); - _PyFunction_ClearByVersionCache(interp); Py_RETURN_NONE; } From 00ae0fa83b743df799391055457b35785558da3e Mon Sep 17 00:00:00 2001 From: Guido van Rossum Date: Sun, 13 Aug 2023 11:45:42 -0700 Subject: [PATCH 26/34] Correctly set instr_fmt metadata for macros Macros not using oparg should have a format starting with IX, not IB. This corrects the metadata for a bunch of specializations, e.g. BINARY_OP_MULTIPLY_INT (which never looks at oparg). --- Include/internal/pycore_opcode_metadata.h | 20 ++++++++++---------- Tools/cases_generator/analysis.py | 2 +- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/Include/internal/pycore_opcode_metadata.h b/Include/internal/pycore_opcode_metadata.h index dec41fccca5fbd..05d617ba949d16 100644 --- a/Include/internal/pycore_opcode_metadata.h +++ b/Include/internal/pycore_opcode_metadata.h @@ -1186,7 +1186,7 @@ const struct opcode_metadata _PyOpcode_opcode_metadata[OPCODE_METADATA_SIZE] = { [STORE_FAST_STORE_FAST] = { true, INSTR_FMT_IB, HAS_ARG_FLAG | HAS_LOCAL_FLAG }, [POP_TOP] = { true, INSTR_FMT_IX, 0 }, [PUSH_NULL] = { true, INSTR_FMT_IX, 0 }, - [END_FOR] = { true, INSTR_FMT_IB, 0 }, + [END_FOR] = { true, INSTR_FMT_IX, 0 }, [INSTRUMENTED_END_FOR] = { true, INSTR_FMT_IX, 0 }, [END_SEND] = { true, INSTR_FMT_IX, 0 }, [INSTRUMENTED_END_SEND] = { true, INSTR_FMT_IX, 0 }, @@ -1200,14 +1200,14 @@ const struct opcode_metadata _PyOpcode_opcode_metadata[OPCODE_METADATA_SIZE] = { [TO_BOOL_STR] = { true, INSTR_FMT_IXC00, 0 }, [TO_BOOL_ALWAYS_TRUE] = { true, INSTR_FMT_IXC00, 0 }, [UNARY_INVERT] = { true, INSTR_FMT_IX, 0 }, - [BINARY_OP_MULTIPLY_INT] = { true, INSTR_FMT_IBC, 0 }, - [BINARY_OP_ADD_INT] = { true, INSTR_FMT_IBC, 0 }, - [BINARY_OP_SUBTRACT_INT] = { true, INSTR_FMT_IBC, 0 }, - [BINARY_OP_MULTIPLY_FLOAT] = { true, INSTR_FMT_IBC, 0 }, - [BINARY_OP_ADD_FLOAT] = { true, INSTR_FMT_IBC, 0 }, - [BINARY_OP_SUBTRACT_FLOAT] = { true, INSTR_FMT_IBC, 0 }, - [BINARY_OP_ADD_UNICODE] = { true, INSTR_FMT_IBC, 0 }, - [BINARY_OP_INPLACE_ADD_UNICODE] = { true, INSTR_FMT_IB, HAS_LOCAL_FLAG }, + [BINARY_OP_MULTIPLY_INT] = { true, INSTR_FMT_IXC, 0 }, + [BINARY_OP_ADD_INT] = { true, INSTR_FMT_IXC, 0 }, + [BINARY_OP_SUBTRACT_INT] = { true, INSTR_FMT_IXC, 0 }, + [BINARY_OP_MULTIPLY_FLOAT] = { true, INSTR_FMT_IXC, 0 }, + [BINARY_OP_ADD_FLOAT] = { true, INSTR_FMT_IXC, 0 }, + [BINARY_OP_SUBTRACT_FLOAT] = { true, INSTR_FMT_IXC, 0 }, + [BINARY_OP_ADD_UNICODE] = { true, INSTR_FMT_IXC, 0 }, + [BINARY_OP_INPLACE_ADD_UNICODE] = { true, INSTR_FMT_IX, HAS_LOCAL_FLAG }, [BINARY_SUBSCR] = { true, INSTR_FMT_IXC, 0 }, [BINARY_SLICE] = { true, INSTR_FMT_IX, 0 }, [STORE_SLICE] = { true, INSTR_FMT_IX, 0 }, @@ -1254,7 +1254,7 @@ const struct opcode_metadata _PyOpcode_opcode_metadata[OPCODE_METADATA_SIZE] = { [DELETE_ATTR] = { true, INSTR_FMT_IB, HAS_ARG_FLAG | HAS_NAME_FLAG }, [STORE_GLOBAL] = { true, INSTR_FMT_IB, HAS_ARG_FLAG | HAS_NAME_FLAG }, [DELETE_GLOBAL] = { true, INSTR_FMT_IB, HAS_ARG_FLAG | HAS_NAME_FLAG }, - [LOAD_LOCALS] = { true, INSTR_FMT_IB, 0 }, + [LOAD_LOCALS] = { true, INSTR_FMT_IX, 0 }, [LOAD_NAME] = { true, INSTR_FMT_IB, HAS_ARG_FLAG | HAS_NAME_FLAG }, [LOAD_FROM_DICT_OR_GLOBALS] = { true, INSTR_FMT_IB, HAS_ARG_FLAG | HAS_NAME_FLAG }, [LOAD_GLOBAL] = { true, INSTR_FMT_IBC000, HAS_ARG_FLAG | HAS_NAME_FLAG }, diff --git a/Tools/cases_generator/analysis.py b/Tools/cases_generator/analysis.py index 2db1cd01c19ae5..593cabbe919740 100644 --- a/Tools/cases_generator/analysis.py +++ b/Tools/cases_generator/analysis.py @@ -367,7 +367,7 @@ def analyze_macro(self, macro: parsing.Macro) -> MacroInstruction: flags.add(instr.instr_flags) case _: typing.assert_never(component) - format = "IB" + format = "IB" if flags.HAS_ARG_FLAG else "IX" if offset: format += "C" + "0" * (offset - 1) return MacroInstruction(macro.name, format, flags, macro, parts, offset) From fe843d8d8ed16f0cc7f1832f93f2942e1e73e156 Mon Sep 17 00:00:00 2001 From: Guido van Rossum Date: Sun, 13 Aug 2023 13:14:44 -0700 Subject: [PATCH 27/34] Split RETURN_{VALUE,CONST} into uops (mostly works) --- Include/internal/pycore_ceval.h | 1 + Include/internal/pycore_opcode_metadata.h | 64 ++++++++++++--------- Python/abstract_interp_cases.c.h | 5 ++ Python/bytecodes.c | 46 ++++++++------- Python/ceval.c | 8 +-- Python/executor_cases.c.h | 31 ++++++++++ Python/generated_cases.c.h | 70 +++++++++++++++++------ Python/optimizer.c | 6 ++ 8 files changed, 159 insertions(+), 72 deletions(-) diff --git a/Include/internal/pycore_ceval.h b/Include/internal/pycore_ceval.h index 05b7380597812b..0e3a99be8c36aa 100644 --- a/Include/internal/pycore_ceval.h +++ b/Include/internal/pycore_ceval.h @@ -171,6 +171,7 @@ void _PyEval_FormatKwargsError(PyThreadState *tstate, PyObject *func, PyObject * PyObject *_PyEval_MatchClass(PyThreadState *tstate, PyObject *subject, PyObject *type, Py_ssize_t nargs, PyObject *kwargs); PyObject *_PyEval_MatchKeys(PyThreadState *tstate, PyObject *map, PyObject *keys); int _PyEval_UnpackIterable(PyThreadState *tstate, PyObject *v, int argcnt, int argcntafter, PyObject **sp); +void _PyEval_FrameClearAndPop(PyThreadState *tstate, _PyInterpreterFrame *frame); #ifdef __cplusplus diff --git a/Include/internal/pycore_opcode_metadata.h b/Include/internal/pycore_opcode_metadata.h index 05d617ba949d16..6bffde242ef588 100644 --- a/Include/internal/pycore_opcode_metadata.h +++ b/Include/internal/pycore_opcode_metadata.h @@ -33,34 +33,35 @@ #define _BINARY_OP_SUBTRACT_FLOAT 309 #define _GUARD_BOTH_UNICODE 310 #define _BINARY_OP_ADD_UNICODE 311 -#define _LOAD_LOCALS 312 -#define _LOAD_FROM_DICT_OR_GLOBALS 313 -#define _GUARD_GLOBALS_VERSION 314 -#define _GUARD_BUILTINS_VERSION 315 -#define _LOAD_GLOBAL_MODULE 316 -#define _LOAD_GLOBAL_BUILTINS 317 -#define _GUARD_TYPE_VERSION 318 -#define _CHECK_MANAGED_OBJECT_HAS_VALUES 319 -#define _LOAD_ATTR_INSTANCE_VALUE 320 -#define IS_NONE 321 -#define _ITER_CHECK_LIST 322 -#define _IS_ITER_EXHAUSTED_LIST 323 -#define _ITER_NEXT_LIST 324 -#define _ITER_CHECK_TUPLE 325 -#define _IS_ITER_EXHAUSTED_TUPLE 326 -#define _ITER_NEXT_TUPLE 327 -#define _ITER_CHECK_RANGE 328 -#define _IS_ITER_EXHAUSTED_RANGE 329 -#define _ITER_NEXT_RANGE 330 -#define _CHECK_PEP_523 331 -#define _CHECK_FUNCTION_EXACT_ARGS 332 -#define _CHECK_STACK_SPACE 333 -#define _INIT_CALL_PY_EXACT_ARGS 334 -#define _PUSH_FRAME 335 -#define _POP_JUMP_IF_FALSE 336 -#define _POP_JUMP_IF_TRUE 337 -#define JUMP_TO_TOP 338 -#define INSERT 339 +#define _POP_FRAME 312 +#define _LOAD_LOCALS 313 +#define _LOAD_FROM_DICT_OR_GLOBALS 314 +#define _GUARD_GLOBALS_VERSION 315 +#define _GUARD_BUILTINS_VERSION 316 +#define _LOAD_GLOBAL_MODULE 317 +#define _LOAD_GLOBAL_BUILTINS 318 +#define _GUARD_TYPE_VERSION 319 +#define _CHECK_MANAGED_OBJECT_HAS_VALUES 320 +#define _LOAD_ATTR_INSTANCE_VALUE 321 +#define IS_NONE 322 +#define _ITER_CHECK_LIST 323 +#define _IS_ITER_EXHAUSTED_LIST 324 +#define _ITER_NEXT_LIST 325 +#define _ITER_CHECK_TUPLE 326 +#define _IS_ITER_EXHAUSTED_TUPLE 327 +#define _ITER_NEXT_TUPLE 328 +#define _ITER_CHECK_RANGE 329 +#define _IS_ITER_EXHAUSTED_RANGE 330 +#define _ITER_NEXT_RANGE 331 +#define _CHECK_PEP_523 332 +#define _CHECK_FUNCTION_EXACT_ARGS 333 +#define _CHECK_STACK_SPACE 334 +#define _INIT_CALL_PY_EXACT_ARGS 335 +#define _PUSH_FRAME 336 +#define _POP_JUMP_IF_FALSE 337 +#define _POP_JUMP_IF_TRUE 338 +#define JUMP_TO_TOP 339 +#define INSERT 340 extern int _PyOpcode_num_popped(int opcode, int oparg, bool jump); #ifdef NEED_OPCODE_METADATA @@ -196,6 +197,8 @@ int _PyOpcode_num_popped(int opcode, int oparg, bool jump) { return oparg; case INTERPRETER_EXIT: return 1; + case _POP_FRAME: + return 1; case RETURN_VALUE: return 1; case INSTRUMENTED_RETURN_VALUE: @@ -720,6 +723,8 @@ int _PyOpcode_num_pushed(int opcode, int oparg, bool jump) { return 0; case INTERPRETER_EXIT: return 0; + case _POP_FRAME: + return 0; case RETURN_VALUE: return 0; case INSTRUMENTED_RETURN_VALUE: @@ -1440,6 +1445,8 @@ const struct opcode_macro_expansion _PyOpcode_macro_expansion[OPCODE_MACRO_EXPAN [DELETE_SUBSCR] = { .nuops = 1, .uops = { { DELETE_SUBSCR, 0, 0 } } }, [CALL_INTRINSIC_1] = { .nuops = 1, .uops = { { CALL_INTRINSIC_1, 0, 0 } } }, [CALL_INTRINSIC_2] = { .nuops = 1, .uops = { { CALL_INTRINSIC_2, 0, 0 } } }, + [RETURN_VALUE] = { .nuops = 1, .uops = { { _POP_FRAME, 0, 0 } } }, + [RETURN_CONST] = { .nuops = 2, .uops = { { LOAD_CONST, 0, 0 }, { _POP_FRAME, 0, 0 } } }, [GET_AITER] = { .nuops = 1, .uops = { { GET_AITER, 0, 0 } } }, [GET_ANEXT] = { .nuops = 1, .uops = { { GET_ANEXT, 0, 0 } } }, [GET_AWAITABLE] = { .nuops = 1, .uops = { { GET_AWAITABLE, 0, 0 } } }, @@ -1541,6 +1548,7 @@ const char * const _PyOpcode_uop_name[OPCODE_UOP_NAME_SIZE] = { [_BINARY_OP_SUBTRACT_FLOAT] = "_BINARY_OP_SUBTRACT_FLOAT", [_GUARD_BOTH_UNICODE] = "_GUARD_BOTH_UNICODE", [_BINARY_OP_ADD_UNICODE] = "_BINARY_OP_ADD_UNICODE", + [_POP_FRAME] = "_POP_FRAME", [_LOAD_LOCALS] = "_LOAD_LOCALS", [_LOAD_FROM_DICT_OR_GLOBALS] = "_LOAD_FROM_DICT_OR_GLOBALS", [_GUARD_GLOBALS_VERSION] = "_GUARD_GLOBALS_VERSION", diff --git a/Python/abstract_interp_cases.c.h b/Python/abstract_interp_cases.c.h index accb9129b5301d..8624d9c6c264eb 100644 --- a/Python/abstract_interp_cases.c.h +++ b/Python/abstract_interp_cases.c.h @@ -195,6 +195,11 @@ break; } + case _POP_FRAME: { + STACK_SHRINK(1); + break; + } + case GET_AITER: { PARTITIONNODE_OVERWRITE((_Py_PARTITIONNODE_t *)PARTITIONNODE_NULLROOT, PEEK(-(-1)), true); break; diff --git a/Python/bytecodes.c b/Python/bytecodes.c index 11da07378cdc37..223409ecbce9ce 100644 --- a/Python/bytecodes.c +++ b/Python/bytecodes.c @@ -764,21 +764,39 @@ dummy_func( return retval; } - inst(RETURN_VALUE, (retval --)) { - STACK_SHRINK(1); + // The stack effect here is ambiguous. + // We definitely pop the return value off the stack on entry. + // We also push it onto the stack on exit, but that's a + // different frame, and it's accounted for by _PUSH_FRAME. + op(_POP_FRAME, (retval --)) { assert(EMPTY()); - _PyFrame_SetStackPointer(frame, stack_pointer); + SAVE_FRAME_STATE(); // Signals to the code generator + #if TIER_ONE _Py_LeaveRecursiveCallPy(tstate); assert(frame != &entry_frame); + #endif // GH-99729: We need to unlink the frame *before* clearing it: _PyInterpreterFrame *dying = frame; + #if TIER_ONE frame = cframe.current_frame = dying->previous; - _PyEvalFrameClearAndPop(tstate, dying); + #endif + #if TIER_TWO + frame = tstate->cframe->current_frame = dying->previous; + #endif + _PyEval_FrameClearAndPop(tstate, dying); frame->prev_instr += frame->return_offset; _PyFrame_StackPush(frame, retval); + #if TIER_ONE goto resume_frame; + #endif + #if TIER_TWO + stack_pointer = _PyFrame_GetStackPointer(frame); + ip_offset = (_Py_CODEUNIT *)_PyFrame_GetCode(frame)->co_code_adaptive; + #endif } + macro(RETURN_VALUE) = _POP_FRAME; + inst(INSTRUMENTED_RETURN_VALUE, (retval --)) { int err = _Py_call_instrumentation_arg( tstate, PY_MONITORING_EVENT_PY_RETURN, @@ -792,27 +810,13 @@ dummy_func( // GH-99729: We need to unlink the frame *before* clearing it: _PyInterpreterFrame *dying = frame; frame = cframe.current_frame = dying->previous; - _PyEvalFrameClearAndPop(tstate, dying); + _PyEval_FrameClearAndPop(tstate, dying); frame->prev_instr += frame->return_offset; _PyFrame_StackPush(frame, retval); goto resume_frame; } - inst(RETURN_CONST, (--)) { - PyObject *retval = GETITEM(FRAME_CO_CONSTS, oparg); - Py_INCREF(retval); - assert(EMPTY()); - _PyFrame_SetStackPointer(frame, stack_pointer); - _Py_LeaveRecursiveCallPy(tstate); - assert(frame != &entry_frame); - // GH-99729: We need to unlink the frame *before* clearing it: - _PyInterpreterFrame *dying = frame; - frame = cframe.current_frame = dying->previous; - _PyEvalFrameClearAndPop(tstate, dying); - frame->prev_instr += frame->return_offset; - _PyFrame_StackPush(frame, retval); - goto resume_frame; - } + macro(RETURN_CONST) = LOAD_CONST + _POP_FRAME; inst(INSTRUMENTED_RETURN_CONST, (--)) { PyObject *retval = GETITEM(FRAME_CO_CONSTS, oparg); @@ -828,7 +832,7 @@ dummy_func( // GH-99729: We need to unlink the frame *before* clearing it: _PyInterpreterFrame *dying = frame; frame = cframe.current_frame = dying->previous; - _PyEvalFrameClearAndPop(tstate, dying); + _PyEval_FrameClearAndPop(tstate, dying); frame->prev_instr += frame->return_offset; _PyFrame_StackPush(frame, retval); goto resume_frame; diff --git a/Python/ceval.c b/Python/ceval.c index 26e741ed7c7547..3887bfdb76cd11 100644 --- a/Python/ceval.c +++ b/Python/ceval.c @@ -222,8 +222,6 @@ _PyEvalFramePushAndInit(PyThreadState *tstate, PyFunctionObject *func, static _PyInterpreterFrame * _PyEvalFramePushAndInit_Ex(PyThreadState *tstate, PyFunctionObject *func, PyObject *locals, Py_ssize_t nargs, PyObject *callargs, PyObject *kwargs); -static void -_PyEvalFrameClearAndPop(PyThreadState *tstate, _PyInterpreterFrame *frame); #ifdef HAVE_ERRNO_H #include @@ -925,7 +923,7 @@ _PyEval_EvalFrameDefault(PyThreadState *tstate, _PyInterpreterFrame *frame, int // GH-99729: We need to unlink the frame *before* clearing it: _PyInterpreterFrame *dying = frame; frame = cframe.current_frame = dying->previous; - _PyEvalFrameClearAndPop(tstate, dying); + _PyEval_FrameClearAndPop(tstate, dying); frame->return_offset = 0; if (frame == &entry_frame) { /* Restore previous cframe and exit */ @@ -1495,8 +1493,8 @@ clear_gen_frame(PyThreadState *tstate, _PyInterpreterFrame * frame) frame->previous = NULL; } -static void -_PyEvalFrameClearAndPop(PyThreadState *tstate, _PyInterpreterFrame * frame) +void +_PyEval_FrameClearAndPop(PyThreadState *tstate, _PyInterpreterFrame * frame) { if (frame->owner == FRAME_OWNED_BY_THREAD) { clear_thread_frame(tstate, frame); diff --git a/Python/executor_cases.c.h b/Python/executor_cases.c.h index 94138e9b459989..d7e826c4d1ffdf 100644 --- a/Python/executor_cases.c.h +++ b/Python/executor_cases.c.h @@ -684,6 +684,37 @@ break; } + case _POP_FRAME: { + PyObject *retval; + retval = stack_pointer[-1]; + STACK_SHRINK(1); + assert(EMPTY()); + SAVE_FRAME_STATE(); // Signals to the code generator + #if TIER_ONE + _Py_LeaveRecursiveCallPy(tstate); + assert(frame != &entry_frame); + #endif + // GH-99729: We need to unlink the frame *before* clearing it: + _PyInterpreterFrame *dying = frame; + #if TIER_ONE + frame = cframe.current_frame = dying->previous; + #endif + #if TIER_TWO + frame = tstate->cframe->current_frame = dying->previous; + #endif + _PyEval_FrameClearAndPop(tstate, dying); + frame->prev_instr += frame->return_offset; + _PyFrame_StackPush(frame, retval); + #if TIER_ONE + goto resume_frame; + #endif + #if TIER_TWO + stack_pointer = _PyFrame_GetStackPointer(frame); + ip_offset = (_Py_CODEUNIT *)_PyFrame_GetCode(frame)->co_code_adaptive; + #endif + break; + } + case GET_AITER: { PyObject *obj; PyObject *iter; diff --git a/Python/generated_cases.c.h b/Python/generated_cases.c.h index 63a45212a17a0a..f2aae9c64761c6 100644 --- a/Python/generated_cases.c.h +++ b/Python/generated_cases.c.h @@ -978,17 +978,29 @@ retval = stack_pointer[-1]; STACK_SHRINK(1); assert(EMPTY()); - _PyFrame_SetStackPointer(frame, stack_pointer); + SAVE_FRAME_STATE(); // Signals to the code generator + #if TIER_ONE _Py_LeaveRecursiveCallPy(tstate); assert(frame != &entry_frame); + #endif // GH-99729: We need to unlink the frame *before* clearing it: _PyInterpreterFrame *dying = frame; + #if TIER_ONE frame = cframe.current_frame = dying->previous; - _PyEvalFrameClearAndPop(tstate, dying); + #endif + #if TIER_TWO + frame = tstate->cframe->current_frame = dying->previous; + #endif + _PyEval_FrameClearAndPop(tstate, dying); frame->prev_instr += frame->return_offset; _PyFrame_StackPush(frame, retval); + #if TIER_ONE goto resume_frame; - STACK_SHRINK(1); + #endif + #if TIER_TWO + stack_pointer = _PyFrame_GetStackPointer(frame); + ip_offset = (_Py_CODEUNIT *)_PyFrame_GetCode(frame)->co_code_adaptive; + #endif } TARGET(INSTRUMENTED_RETURN_VALUE) { @@ -1006,7 +1018,7 @@ // GH-99729: We need to unlink the frame *before* clearing it: _PyInterpreterFrame *dying = frame; frame = cframe.current_frame = dying->previous; - _PyEvalFrameClearAndPop(tstate, dying); + _PyEval_FrameClearAndPop(tstate, dying); frame->prev_instr += frame->return_offset; _PyFrame_StackPush(frame, retval); goto resume_frame; @@ -1014,19 +1026,41 @@ } TARGET(RETURN_CONST) { - PyObject *retval = GETITEM(FRAME_CO_CONSTS, oparg); - Py_INCREF(retval); - assert(EMPTY()); - _PyFrame_SetStackPointer(frame, stack_pointer); - _Py_LeaveRecursiveCallPy(tstate); - assert(frame != &entry_frame); - // GH-99729: We need to unlink the frame *before* clearing it: - _PyInterpreterFrame *dying = frame; - frame = cframe.current_frame = dying->previous; - _PyEvalFrameClearAndPop(tstate, dying); - frame->prev_instr += frame->return_offset; - _PyFrame_StackPush(frame, retval); - goto resume_frame; + PyObject *value; + PyObject *retval; + // LOAD_CONST + { + value = GETITEM(FRAME_CO_CONSTS, oparg); + Py_INCREF(value); + } + // _POP_FRAME + retval = value; + { + assert(EMPTY()); + SAVE_FRAME_STATE(); // Signals to the code generator + #if TIER_ONE + _Py_LeaveRecursiveCallPy(tstate); + assert(frame != &entry_frame); + #endif + // GH-99729: We need to unlink the frame *before* clearing it: + _PyInterpreterFrame *dying = frame; + #if TIER_ONE + frame = cframe.current_frame = dying->previous; + #endif + #if TIER_TWO + frame = tstate->cframe->current_frame = dying->previous; + #endif + _PyEval_FrameClearAndPop(tstate, dying); + frame->prev_instr += frame->return_offset; + _PyFrame_StackPush(frame, retval); + #if TIER_ONE + goto resume_frame; + #endif + #if TIER_TWO + stack_pointer = _PyFrame_GetStackPointer(frame); + ip_offset = (_Py_CODEUNIT *)_PyFrame_GetCode(frame)->co_code_adaptive; + #endif + } } TARGET(INSTRUMENTED_RETURN_CONST) { @@ -1043,7 +1077,7 @@ // GH-99729: We need to unlink the frame *before* clearing it: _PyInterpreterFrame *dying = frame; frame = cframe.current_frame = dying->previous; - _PyEvalFrameClearAndPop(tstate, dying); + _PyEval_FrameClearAndPop(tstate, dying); frame->prev_instr += frame->return_offset; _PyFrame_StackPush(frame, retval); goto resume_frame; diff --git a/Python/optimizer.c b/Python/optimizer.c index dd30d53063b4c6..36bf035b0b1f4a 100644 --- a/Python/optimizer.c +++ b/Python/optimizer.c @@ -619,6 +619,12 @@ translate_bytecode_to_trace( expansion->uops[i].offset); Py_FatalError("garbled expansion"); } + if (expansion->uops[i].uop == _POP_FRAME) { + // TODO: Move this code *after* adding it to the trace + // TODO: Continue in the previous code object, if any + ADD_TO_TRACE(SAVE_IP, INSTR_IP(instr, code), 0); + goto done; + } ADD_TO_TRACE(expansion->uops[i].uop, oparg, operand); if (expansion->uops[i].uop == _PUSH_FRAME) { assert(i + 1 == nuops); From 5bf745c62251ce2a6ebda20d891caab7a2878648 Mon Sep 17 00:00:00 2001 From: Guido van Rossum Date: Sun, 13 Aug 2023 16:34:26 -0700 Subject: [PATCH 28/34] Change LLTRACE debug to trigger on PYTHONUOPSDEBUG >= 5, not 4 --- Python/ceval.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Python/ceval.c b/Python/ceval.c index 3887bfdb76cd11..af557ba532e5de 100644 --- a/Python/ceval.c +++ b/Python/ceval.c @@ -736,7 +736,7 @@ _PyEval_EvalFrameDefault(PyThreadState *tstate, _PyInterpreterFrame *frame, int // When tracing executed uops, also trace bytecode char *uop_debug = Py_GETENV("PYTHONUOPSDEBUG"); if (uop_debug != NULL && *uop_debug >= '0') { - lltrace = (*uop_debug - '0') >= 4; // TODO: Parse an int and all that + lltrace = (*uop_debug - '0') >= 5; // TODO: Parse an int and all that } } } From 1e55726810fc78ce56f4f644f1cc2960c7b62a19 Mon Sep 17 00:00:00 2001 From: Guido van Rossum Date: Sun, 13 Aug 2023 17:08:53 -0700 Subject: [PATCH 29/34] Handle _POP_FRAME in superblock properly --- Python/optimizer.c | 53 +++++++++++++++++++++++++++++++++++++++------- 1 file changed, 45 insertions(+), 8 deletions(-) diff --git a/Python/optimizer.c b/Python/optimizer.c index 36bf035b0b1f4a..c678346d5b477f 100644 --- a/Python/optimizer.c +++ b/Python/optimizer.c @@ -373,6 +373,8 @@ static PyTypeObject UOpExecutor_Type = { .tp_as_sequence = &uop_as_sequence, }; +#define TRACE_STACK_SIZE 5 + static int translate_bytecode_to_trace( PyCodeObject *code, @@ -380,10 +382,16 @@ translate_bytecode_to_trace( _PyUOpInstruction *trace, int buffer_size) { + PyCodeObject *initial_code = code; _Py_CODEUNIT *initial_instr = instr; int trace_length = 0; int max_length = buffer_size; int reserved = 0; + struct { + PyCodeObject *code; + _Py_CODEUNIT *instr; + } trace_stack[TRACE_STACK_SIZE]; + int trace_stack_depth = 0; #ifdef Py_DEBUG char *uop_debug = Py_GETENV("PYTHONUOPSDEBUG"); @@ -441,6 +449,24 @@ translate_bytecode_to_trace( // Reserve space for main+stub uops, plus 2 for SAVE_IP and EXIT_TRACE #define RESERVE(main, stub) RESERVE_RAW((main) + (stub) + 2, uop_name(opcode)) +// Trace stack operations (used by _PUSH_FRAME, _POP_FRAME) +#define TRACE_STACK_PUSH() \ + if (trace_stack_depth >= TRACE_STACK_SIZE) { \ + DPRINTF(2, "Trace stack overflow\n"); \ + ADD_TO_TRACE(SAVE_IP, 0, 0); \ + goto done; \ + } \ + trace_stack[trace_stack_depth].code = code; \ + trace_stack[trace_stack_depth].instr = instr; \ + trace_stack_depth++; +#define TRACE_STACK_POP() \ + if (trace_stack_depth <= 0) { \ + Py_FatalError("Trace stack underflow\n"); \ + } \ + trace_stack_depth--; \ + code = trace_stack[trace_stack_depth].code; \ + instr = trace_stack[trace_stack_depth].instr; + DPRINTF(4, "Optimizing %s (%s:%d) at byte offset %d\n", PyUnicode_AsUTF8(code->co_qualname), @@ -509,7 +535,7 @@ translate_bytecode_to_trace( case JUMP_BACKWARD: { - if (instr + 2 - oparg == initial_instr) { + if (instr + 2 - oparg == initial_instr && code == initial_code) { RESERVE(1, 0); ADD_TO_TRACE(JUMP_TO_TOP, 0, 0); } @@ -619,13 +645,17 @@ translate_bytecode_to_trace( expansion->uops[i].offset); Py_FatalError("garbled expansion"); } + ADD_TO_TRACE(expansion->uops[i].uop, oparg, operand); if (expansion->uops[i].uop == _POP_FRAME) { - // TODO: Move this code *after* adding it to the trace - // TODO: Continue in the previous code object, if any - ADD_TO_TRACE(SAVE_IP, INSTR_IP(instr, code), 0); - goto done; + TRACE_STACK_POP(); + DPRINTF(2, + "Returning to %s (%s:%d) at byte offset %d\n", + PyUnicode_AsUTF8(code->co_qualname), + PyUnicode_AsUTF8(code->co_filename), + code->co_firstlineno, + 2 * INSTR_IP(instr, code)); + goto top; } - ADD_TO_TRACE(expansion->uops[i].uop, oparg, operand); if (expansion->uops[i].uop == _PUSH_FRAME) { assert(i + 1 == nuops); int func_version_offset = @@ -646,14 +676,17 @@ translate_bytecode_to_trace( ADD_TO_TRACE(SAVE_IP, 0, 0); goto done; } + // Increment IP to the return address + instr += _PyOpcode_Caches[_PyOpcode_Deopt[opcode]] + 1; + TRACE_STACK_PUSH(); code = new_code; - initial_instr = instr = _PyCode_CODE(code); + instr = _PyCode_CODE(code); DPRINTF(2, "Continuing in %s (%s:%d) at byte offset %d\n", PyUnicode_AsUTF8(code->co_qualname), PyUnicode_AsUTF8(code->co_filename), code->co_firstlineno, - 2 * INSTR_IP(initial_instr, code)); + 2 * INSTR_IP(instr, code)); goto top; } ADD_TO_TRACE(SAVE_IP, 0, 0); @@ -674,6 +707,10 @@ translate_bytecode_to_trace( } // End for (;;) done: + while (trace_stack_depth > 0) { + TRACE_STACK_POP(); + } + assert(code == initial_code); // Skip short traces like SAVE_IP, LOAD_FAST, SAVE_IP, EXIT_TRACE if (trace_length > 3) { ADD_TO_TRACE(EXIT_TRACE, 0, 0); From c632262a8a771464cf6394bb5e5253c3db77740c Mon Sep 17 00:00:00 2001 From: Guido van Rossum Date: Sun, 13 Aug 2023 17:37:04 -0700 Subject: [PATCH 30/34] Handle trace stack underflow --- Python/optimizer.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/Python/optimizer.c b/Python/optimizer.c index c678346d5b477f..f282c041a4c59f 100644 --- a/Python/optimizer.c +++ b/Python/optimizer.c @@ -600,6 +600,14 @@ translate_bytecode_to_trace( // Reserve space for nuops (+ SAVE_IP + EXIT_TRACE) int nuops = expansion->nuops; RESERVE(nuops, 0); + if (expansion->uops[nuops-1].uop == _POP_FRAME) { + // Check for trace stack underflow now: + // We can't bail e.g. in the middle of + // LOAD_CONST + _POP_FRAME. + if (trace_stack_depth == 0) { + DPRINTF(2, "Trace stack underflow\n"); + goto done;} + } uint32_t orig_oparg = oparg; // For OPARG_TOP/BOTTOM for (int i = 0; i < nuops; i++) { oparg = orig_oparg; From b59a3af9fe1ef14ac7081ebf7daa7ad14b96febe Mon Sep 17 00:00:00 2001 From: Guido van Rossum Date: Tue, 15 Aug 2023 13:58:59 -0700 Subject: [PATCH 31/34] Add _Py_LeaveRecursiveCallPy to _POP_FRAME --- Python/bytecodes.c | 4 +--- Python/ceval.c | 4 ---- Python/ceval_macros.h | 4 ++++ Python/executor_cases.c.h | 4 +--- Python/generated_cases.c.h | 8 ++------ 5 files changed, 8 insertions(+), 16 deletions(-) diff --git a/Python/bytecodes.c b/Python/bytecodes.c index 223409ecbce9ce..439e7179604fd6 100644 --- a/Python/bytecodes.c +++ b/Python/bytecodes.c @@ -771,13 +771,11 @@ dummy_func( op(_POP_FRAME, (retval --)) { assert(EMPTY()); SAVE_FRAME_STATE(); // Signals to the code generator - #if TIER_ONE _Py_LeaveRecursiveCallPy(tstate); - assert(frame != &entry_frame); - #endif // GH-99729: We need to unlink the frame *before* clearing it: _PyInterpreterFrame *dying = frame; #if TIER_ONE + assert(frame != &entry_frame); frame = cframe.current_frame = dying->previous; #endif #if TIER_TWO diff --git a/Python/ceval.c b/Python/ceval.c index af557ba532e5de..e2b1f808c6d217 100644 --- a/Python/ceval.c +++ b/Python/ceval.c @@ -601,10 +601,6 @@ int _Py_CheckRecursiveCallPy( } -static inline void _Py_LeaveRecursiveCallPy(PyThreadState *tstate) { - tstate->py_recursion_remaining++; -} - static const _Py_CODEUNIT _Py_INTERPRETER_TRAMPOLINE_INSTRUCTIONS[] = { /* Put a NOP at the start, so that the IP points into * the code, rather than before it */ diff --git a/Python/ceval_macros.h b/Python/ceval_macros.h index 67afd5c12d5ce3..2eb9e05910463b 100644 --- a/Python/ceval_macros.h +++ b/Python/ceval_macros.h @@ -374,3 +374,7 @@ static inline int _Py_EnterRecursivePy(PyThreadState *tstate) { return (tstate->py_recursion_remaining-- <= 0) && _Py_CheckRecursiveCallPy(tstate); } + +static inline void _Py_LeaveRecursiveCallPy(PyThreadState *tstate) { + tstate->py_recursion_remaining++; +} diff --git a/Python/executor_cases.c.h b/Python/executor_cases.c.h index d7e826c4d1ffdf..f5642e0f09170d 100644 --- a/Python/executor_cases.c.h +++ b/Python/executor_cases.c.h @@ -690,13 +690,11 @@ STACK_SHRINK(1); assert(EMPTY()); SAVE_FRAME_STATE(); // Signals to the code generator - #if TIER_ONE _Py_LeaveRecursiveCallPy(tstate); - assert(frame != &entry_frame); - #endif // GH-99729: We need to unlink the frame *before* clearing it: _PyInterpreterFrame *dying = frame; #if TIER_ONE + assert(frame != &entry_frame); frame = cframe.current_frame = dying->previous; #endif #if TIER_TWO diff --git a/Python/generated_cases.c.h b/Python/generated_cases.c.h index f2aae9c64761c6..8139edaada7acb 100644 --- a/Python/generated_cases.c.h +++ b/Python/generated_cases.c.h @@ -979,13 +979,11 @@ STACK_SHRINK(1); assert(EMPTY()); SAVE_FRAME_STATE(); // Signals to the code generator - #if TIER_ONE _Py_LeaveRecursiveCallPy(tstate); - assert(frame != &entry_frame); - #endif // GH-99729: We need to unlink the frame *before* clearing it: _PyInterpreterFrame *dying = frame; #if TIER_ONE + assert(frame != &entry_frame); frame = cframe.current_frame = dying->previous; #endif #if TIER_TWO @@ -1038,13 +1036,11 @@ { assert(EMPTY()); SAVE_FRAME_STATE(); // Signals to the code generator - #if TIER_ONE _Py_LeaveRecursiveCallPy(tstate); - assert(frame != &entry_frame); - #endif // GH-99729: We need to unlink the frame *before* clearing it: _PyInterpreterFrame *dying = frame; #if TIER_ONE + assert(frame != &entry_frame); frame = cframe.current_frame = dying->previous; #endif #if TIER_TWO From 158e27bf91ed0c15a9e48f4ecd71f09844cc7fa8 Mon Sep 17 00:00:00 2001 From: Guido van Rossum Date: Tue, 15 Aug 2023 16:56:12 -0700 Subject: [PATCH 32/34] Ensure co_stacksize >= 1, else RETURN_CONST may crash --- Objects/codeobject.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Objects/codeobject.c b/Objects/codeobject.c index 6987a2382d81c2..17710c23ef8278 100644 --- a/Objects/codeobject.c +++ b/Objects/codeobject.c @@ -395,6 +395,9 @@ init_code(PyCodeObject *co, struct _PyCodeConstructor *con) int nlocals, ncellvars, nfreevars; get_localsplus_counts(con->localsplusnames, con->localspluskinds, &nlocals, &ncellvars, &nfreevars); + if (con->stacksize == 0) { + con->stacksize = 1; + } co->co_filename = Py_NewRef(con->filename); co->co_name = Py_NewRef(con->name); From 8d9515756d37d975ae4a74016ab2e404dc668016 Mon Sep 17 00:00:00 2001 From: Guido van Rossum Date: Tue, 15 Aug 2023 17:52:06 -0700 Subject: [PATCH 33/34] Fix failing test_code (co_stacksize is never 0) --- Lib/test/test_code.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Lib/test/test_code.py b/Lib/test/test_code.py index ca06a39f5df142..e056c16466e8c4 100644 --- a/Lib/test/test_code.py +++ b/Lib/test/test_code.py @@ -264,7 +264,7 @@ def func2(): ("co_posonlyargcount", 0), ("co_kwonlyargcount", 0), ("co_nlocals", 1), - ("co_stacksize", 0), + ("co_stacksize", 1), ("co_flags", code.co_flags | inspect.CO_COROUTINE), ("co_firstlineno", 100), ("co_code", code2.co_code), From d90ecaf03806f0ccaa7f6dc3f5ce0e11b2c90817 Mon Sep 17 00:00:00 2001 From: Guido van Rossum Date: Tue, 15 Aug 2023 18:23:05 -0700 Subject: [PATCH 34/34] Don't trace into functions if func_version != co_version Seems to fix test_opcache, playing games with func.__code__ = func.__code__.replace(). --- Python/optimizer.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/Python/optimizer.c b/Python/optimizer.c index f282c041a4c59f..57518404c3f19d 100644 --- a/Python/optimizer.c +++ b/Python/optimizer.c @@ -684,6 +684,14 @@ translate_bytecode_to_trace( ADD_TO_TRACE(SAVE_IP, 0, 0); goto done; } + if (new_code->co_version != func_version) { + // func.__code__ was updated. + // Perhaps it may happen again, so don't bother tracing. + // TODO: Reason about this -- is it better to bail or not? + DPRINTF(2, "Bailing because co_version != func_version\n"); + ADD_TO_TRACE(SAVE_IP, 0, 0); + goto done; + } // Increment IP to the return address instr += _PyOpcode_Caches[_PyOpcode_Deopt[opcode]] + 1; TRACE_STACK_PUSH();