
GH-118093: Handle some polymorphism before requiring progress in tier two #122843


Merged: 11 commits, Aug 12, 2024
15 changes: 11 additions & 4 deletions Include/internal/pycore_optimizer.h
@@ -29,8 +29,9 @@ typedef struct {
typedef struct {
uint8_t opcode;
uint8_t oparg;
uint8_t valid;
uint8_t linked;
uint16_t valid:1;
uint16_t linked:1;
uint16_t chain_depth:14; // Must be big enough for MAX_CHAIN_DEPTH - 1.
int index; // Index of ENTER_EXECUTOR (if code isn't NULL, below).
_PyBloomFilter bloom;
_PyExecutorLinkListNode links;
@@ -83,7 +84,7 @@ typedef struct _PyOptimizerObject _PyOptimizerObject;
typedef int (*_Py_optimize_func)(
_PyOptimizerObject* self, struct _PyInterpreterFrame *frame,
_Py_CODEUNIT *instr, _PyExecutorObject **exec_ptr,
int curr_stackentries);
int curr_stackentries, bool progress_needed);

struct _PyOptimizerObject {
PyObject_HEAD
@@ -182,6 +183,12 @@ static inline uint16_t uop_get_error_target(const _PyUOpInstruction *inst)
// Need extras for root frame and for overflow frame (see TRACE_STACK_PUSH())
#define MAX_ABSTRACT_FRAME_DEPTH (TRACE_STACK_SIZE + 2)

// The maximum number of side exits that we can take before requiring forward
// progress (and inserting a new ENTER_EXECUTOR instruction). In practice, this
// is the "maximum amount of polymorphism" that an isolated trace tree can
// handle before rejoining the rest of the program.
#define MAX_CHAIN_DEPTH 4

typedef struct _Py_UopsSymbol _Py_UopsSymbol;

struct _Py_UOpsAbstractFrame {
@@ -257,7 +264,7 @@ extern int _Py_uop_frame_pop(_Py_UOpsContext *ctx);

PyAPI_FUNC(PyObject *) _Py_uop_symbols_test(PyObject *self, PyObject *ignored);

PyAPI_FUNC(int) _PyOptimizer_Optimize(struct _PyInterpreterFrame *frame, _Py_CODEUNIT *start, _PyStackRef *stack_pointer, _PyExecutorObject **exec_ptr);
PyAPI_FUNC(int) _PyOptimizer_Optimize(struct _PyInterpreterFrame *frame, _Py_CODEUNIT *start, _PyStackRef *stack_pointer, _PyExecutorObject **exec_ptr, int chain_depth);

static inline int is_terminator(const _PyUOpInstruction *uop)
{
@@ -0,0 +1 @@
Improve the experimental JIT's handling of polymorphic code.
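
The pycore_optimizer.h hunks above shrink `valid` and `linked` to one-bit fields so that a 14-bit `chain_depth` can share the same 16 bits, and introduce `MAX_CHAIN_DEPTH`. As a rough standalone illustration (the `vm_data_sketch` struct and the `main` driver below are invented for this sketch, not CPython code), the following C snippet packs the same three fields and checks at compile time that `MAX_CHAIN_DEPTH - 1` fits in the 14-bit field:

```c
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define MAX_CHAIN_DEPTH 4  /* mirrors the value added in pycore_optimizer.h */

/* Illustrative stand-in for the vm_data layout: three bitfields share one
 * 16-bit unit instead of the two separate uint8_t flags used before. */
typedef struct {
    uint16_t valid:1;
    uint16_t linked:1;
    uint16_t chain_depth:14;  /* must be big enough for MAX_CHAIN_DEPTH - 1 */
} vm_data_sketch;

/* Compile-time check that the widest stored value fits in 14 bits. */
static_assert(MAX_CHAIN_DEPTH - 1 <= (1 << 14) - 1,
              "chain_depth bitfield too narrow for MAX_CHAIN_DEPTH - 1");

int main(void)
{
    vm_data_sketch d = { .valid = 1, .linked = 0,
                         .chain_depth = MAX_CHAIN_DEPTH - 1 };
    printf("sizeof(vm_data_sketch) = %zu, chain_depth = %u\n",
           sizeof(d), d.chain_depth);
    return 0;
}
```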
7 changes: 4 additions & 3 deletions Python/bytecodes.c
@@ -2508,7 +2508,7 @@ dummy_func(
start--;
}
_PyExecutorObject *executor;
int optimized = _PyOptimizer_Optimize(frame, start, stack_pointer, &executor);
int optimized = _PyOptimizer_Optimize(frame, start, stack_pointer, &executor, 0);
ERROR_IF(optimized < 0, error);
if (optimized) {
assert(tstate->previous_executor == NULL);
@@ -4547,7 +4547,8 @@ dummy_func(
Py_INCREF(executor);
}
else {
int optimized = _PyOptimizer_Optimize(frame, target, stack_pointer, &executor);
int chain_depth = current_executor->vm_data.chain_depth + 1;
int optimized = _PyOptimizer_Optimize(frame, target, stack_pointer, &executor, chain_depth);
if (optimized <= 0) {
exit->temperature = restart_backoff_counter(temperature);
if (optimized < 0) {
@@ -4630,7 +4631,7 @@ dummy_func(
exit->temperature = advance_backoff_counter(exit->temperature);
GOTO_TIER_ONE(target);
}
int optimized = _PyOptimizer_Optimize(frame, target, stack_pointer, &executor);
int optimized = _PyOptimizer_Optimize(frame, target, stack_pointer, &executor, 0);
if (optimized <= 0) {
exit->temperature = restart_backoff_counter(exit->temperature);
if (optimized < 0) {
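The bytecodes.c hunks above thread a chain depth into `_PyOptimizer_Optimize`: call sites that start a fresh chain pass 0, while the side-exit path passes the current executor's depth plus one. Below is a minimal sketch of that calling convention, assuming a hypothetical `toy_executor` struct and helper functions invented here for illustration (this is not the interpreter's real dispatch code):

```c
#include <stdio.h>

/* Toy stand-in for the executor metadata extended in this PR. */
typedef struct {
    int chain_depth;
} toy_executor;

/* Illustrative only: the chain depth each kind of call site would pass
 * to _PyOptimizer_Optimize under the new signature. */
static int depth_for_fresh_entry(void)
{
    /* A fresh entry from tier one (e.g. a hot loop head) starts a new chain. */
    return 0;
}

static int depth_for_side_exit(const toy_executor *current)
{
    /* A side exit from a running executor extends that executor's chain. */
    return current->chain_depth + 1;
}

int main(void)
{
    toy_executor current = { .chain_depth = 2 };
    printf("fresh entry passes chain_depth=%d\n", depth_for_fresh_entry());
    printf("side exit passes chain_depth=%d\n", depth_for_side_exit(&current));
    return 0;
}
```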
5 changes: 3 additions & 2 deletions Python/executor_cases.c.h

(Generated file; diff not rendered.)

2 changes: 1 addition & 1 deletion Python/generated_cases.c.h

(Generated file; diff not rendered.)

85 changes: 53 additions & 32 deletions Python/optimizer.c
@@ -111,7 +111,8 @@ never_optimize(
_PyInterpreterFrame *frame,
_Py_CODEUNIT *instr,
_PyExecutorObject **exec,
int Py_UNUSED(stack_entries))
int Py_UNUSED(stack_entries),
bool Py_UNUSED(progress_needed))
{
// This may be called if the optimizer is reset
return 0;
@@ -176,32 +177,44 @@ _Py_SetTier2Optimizer(_PyOptimizerObject *optimizer)
int
_PyOptimizer_Optimize(
_PyInterpreterFrame *frame, _Py_CODEUNIT *start,
_PyStackRef *stack_pointer, _PyExecutorObject **executor_ptr)
_PyStackRef *stack_pointer, _PyExecutorObject **executor_ptr, int chain_depth)
{
// The first executor in a chain and the MAX_CHAIN_DEPTH'th executor *must*
// make progress in order to avoid infinite loops or excessively-long
// side-exit chains. We can only insert the executor into the bytecode if
// this is true, since a deopt won't infinitely re-enter the executor:
chain_depth %= MAX_CHAIN_DEPTH;
bool progress_needed = chain_depth == 0;
PyCodeObject *code = _PyFrame_GetCode(frame);
assert(PyCode_Check(code));
PyInterpreterState *interp = _PyInterpreterState_GET();
if (!has_space_for_executor(code, start)) {
if (progress_needed && !has_space_for_executor(code, start)) {
return 0;
}
_PyOptimizerObject *opt = interp->optimizer;
int err = opt->optimize(opt, frame, start, executor_ptr, (int)(stack_pointer - _PyFrame_Stackbase(frame)));
int err = opt->optimize(opt, frame, start, executor_ptr, (int)(stack_pointer - _PyFrame_Stackbase(frame)), progress_needed);
if (err <= 0) {
return err;
}
assert(*executor_ptr != NULL);
int index = get_index_for_executor(code, start);
if (index < 0) {
/* Out of memory. Don't raise and assume that the
* error will show up elsewhere.
*
* If an optimizer has already produced an executor,
* it might get confused by the executor disappearing,
* but there is not much we can do about that here. */
Py_DECREF(*executor_ptr);
return 0;
if (progress_needed) {
int index = get_index_for_executor(code, start);
if (index < 0) {
/* Out of memory. Don't raise and assume that the
* error will show up elsewhere.
*
* If an optimizer has already produced an executor,
* it might get confused by the executor disappearing,
* but there is not much we can do about that here. */
Py_DECREF(*executor_ptr);
return 0;
}
insert_executor(code, start, index, *executor_ptr);
}
insert_executor(code, start, index, *executor_ptr);
else {
(*executor_ptr)->vm_data.code = NULL;
}
(*executor_ptr)->vm_data.chain_depth = chain_depth;
assert((*executor_ptr)->vm_data.valid);
return 1;
}
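
In the updated `_PyOptimizer_Optimize` above, the incoming depth is folded modulo `MAX_CHAIN_DEPTH`, and only a chain head (a depth that folds to 0) gets an ENTER_EXECUTOR slot inserted into the bytecode; deeper executors stay reachable only through side exits and have `vm_data.code` set to NULL. The following sketch models just that decision, with `publish` and `toy_executor` invented for illustration (not the real function):

```c
#include <stdbool.h>
#include <stdio.h>

#define MAX_CHAIN_DEPTH 4   /* same value as the new header constant */

/* Toy executor metadata: records whether the executor was published into
 * the bytecode (an ENTER_EXECUTOR slot) or is reachable only via side exits. */
typedef struct {
    int chain_depth;
    bool inserted_into_bytecode;
} toy_executor;

/* Sketch of the decision made at the end of _PyOptimizer_Optimize. */
static void publish(toy_executor *exec, int chain_depth)
{
    chain_depth %= MAX_CHAIN_DEPTH;
    bool progress_needed = (chain_depth == 0);
    exec->chain_depth = chain_depth;
    exec->inserted_into_bytecode = progress_needed;
    printf("depth %d: %s\n", chain_depth,
           progress_needed ? "ENTER_EXECUTOR inserted (chain head)"
                           : "side-exit only (vm_data.code stays NULL)");
}

int main(void)
{
    toy_executor chain[8];
    /* Depths 0 and 4 fold to chain heads; 1-3 and 5-7 stay side-exit-only. */
    for (int depth = 0; depth < 8; depth++) {
        publish(&chain[depth], depth);
    }
    return 0;
}
```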
@@ -530,9 +543,9 @@ translate_bytecode_to_trace(
_Py_CODEUNIT *instr,
_PyUOpInstruction *trace,
int buffer_size,
_PyBloomFilter *dependencies)
_PyBloomFilter *dependencies, bool progress_needed)
{
bool progress_needed = true;
bool first = true;
PyCodeObject *code = _PyFrame_GetCode(frame);
PyFunctionObject *func = (PyFunctionObject *)frame->f_funcobj;
assert(PyFunction_Check(func));
@@ -576,7 +589,7 @@ translate_bytecode_to_trace(
uint32_t opcode = instr->op.code;
uint32_t oparg = instr->op.arg;

if (!progress_needed && instr == initial_instr) {
if (!first && instr == initial_instr) {
// We have looped around to the start:
RESERVE(1);
ADD_TO_TRACE(_JUMP_TO_TOP, 0, 0, 0);
@@ -585,14 +598,6 @@

DPRINTF(2, "%d: %s(%d)\n", target, _PyOpcode_OpName[opcode], oparg);

if (opcode == ENTER_EXECUTOR) {
assert(oparg < 256);
_PyExecutorObject *executor = code->co_executors->executors[oparg];
opcode = executor->vm_data.opcode;
DPRINTF(2, " * ENTER_EXECUTOR -> %s\n", _PyOpcode_OpName[opcode]);
oparg = executor->vm_data.oparg;
}

if (opcode == EXTENDED_ARG) {
instr++;
opcode = instr->op.code;
@@ -602,13 +607,27 @@
goto done;
}
}
if (opcode == ENTER_EXECUTOR) {
Member:
Why are we done? If we want specialized traces, they may overlap.

Member Author:
I've added a comment justifying the decision.

// We have a couple of options here. We *could* peek "underneath"
// this executor and continue tracing, which could give us a longer,
// more optimizable trace (at the expense of lots of duplicated
// tier two code). Instead, we choose to just end here and stitch to
// the other trace, which allows side-exit traces to rejoin the
// "main" trace periodically (and also helps protect us against
// pathological behavior where the amount of tier two code explodes
// for a medium-length, branchy code path). This seems to work
// better in practice, but in the future we could be smarter about
// what we do here:
goto done;
}
assert(opcode != ENTER_EXECUTOR && opcode != EXTENDED_ARG);
RESERVE_RAW(2, "_CHECK_VALIDITY_AND_SET_IP");
ADD_TO_TRACE(_CHECK_VALIDITY_AND_SET_IP, 0, (uintptr_t)instr, target);

/* Special case the first instruction,
* so that we can guarantee forward progress */
if (progress_needed) {
if (first && progress_needed) {
assert(first);
if (OPCODE_HAS_EXIT(opcode) || OPCODE_HAS_DEOPT(opcode)) {
opcode = _PyOpcode_Deopt[opcode];
}
@@ -903,7 +922,7 @@
}
top:
// Jump here after _PUSH_FRAME or likely branches.
progress_needed = false;
first = false;
} // End for (;;)

done:
@@ -912,7 +931,7 @@
}
assert(code == initial_code);
// Skip short traces where we can't even translate a single instruction:
if (progress_needed) {
if (first) {
OPT_STAT_INC(trace_too_short);
DPRINTF(2,
"No trace for %s (%s:%d) at byte offset %d (no progress)\n",
@@ -1225,13 +1244,14 @@ uop_optimize(
_PyInterpreterFrame *frame,
_Py_CODEUNIT *instr,
_PyExecutorObject **exec_ptr,
int curr_stackentries)
int curr_stackentries,
bool progress_needed)
{
_PyBloomFilter dependencies;
_Py_BloomFilter_Init(&dependencies);
_PyUOpInstruction buffer[UOP_MAX_TRACE_LENGTH];
OPT_STAT_INC(attempts);
int length = translate_bytecode_to_trace(frame, instr, buffer, UOP_MAX_TRACE_LENGTH, &dependencies);
int length = translate_bytecode_to_trace(frame, instr, buffer, UOP_MAX_TRACE_LENGTH, &dependencies, progress_needed);
if (length <= 0) {
// Error or nothing translated
return length;
@@ -1328,7 +1348,8 @@ counter_optimize(
_PyInterpreterFrame *frame,
_Py_CODEUNIT *instr,
_PyExecutorObject **exec_ptr,
int Py_UNUSED(curr_stackentries)
int Py_UNUSED(curr_stackentries),
bool Py_UNUSED(progress_needed)
)
{
PyCodeObject *code = _PyFrame_GetCode(frame);