gh-111786: Use separate opcode vars for Tier 1 and Tier 2

mdboom · mdboom · commit 00a506a22e93 · 2023-11-20T15:16:19.000-05:00
Suggested by @neonene: #111786 (comment). This makes Windows about 3% faster on pyperformance benchmarks.
diff --git a/Python/ceval.c b/Python/ceval.c
@@ -678,7 +678,7 @@ _PyEval_EvalFrameDefault(PyThreadState *tstate, _PyInterpreterFrame *frame, int
 #ifdef Py_STATS
     int lastopcode = 0;
 #endif
-    uint16_t opcode;       /* Current opcode */
+    uint8_t opcode;       /* Current opcode */
     int oparg;            /* Current opcode argument, if any */
 #ifdef LLTRACE
     int lltrace = 0;
@@ -765,9 +765,7 @@ _PyEval_EvalFrameDefault(PyThreadState *tstate, _PyInterpreterFrame *frame, int
     /* Start instructions */
 #if !USE_COMPUTED_GOTOS
     dispatch_opcode:
-        // Cast to an 8-bit value to improve the code generated by MSVC
-        // (in combination with the EXTRA_CASES macro).
-        switch ((uint8_t)opcode)
+        switch (opcode)
 #endif
         {
 
@@ -983,30 +981,31 @@ _PyEval_EvalFrameDefault(PyThreadState *tstate, _PyInterpreterFrame *frame, int
 
     OPT_STAT_INC(traces_executed);
     _PyUOpInstruction *next_uop = current_executor->trace;
+    uint16_t uopcode;
     uint64_t operand;
 #ifdef Py_STATS
     uint64_t trace_uop_execution_counter = 0;
 #endif
 
     for (;;) {
-        opcode = next_uop->opcode;
+        uopcode = next_uop->opcode;
         oparg = next_uop->oparg;
         operand = next_uop->operand;
         DPRINTF(3,
                 "%4d: uop %s, oparg %d, operand %" PRIu64 ", stack_level %d\n",
                 (int)(next_uop - current_executor->trace),
-                opcode < 256 ? _PyOpcode_OpName[opcode] : _PyOpcode_uop_name[opcode],
+                uopcode < 256 ? _PyOpcode_OpName[uopcode] : _PyOpcode_uop_name[uopcode],
                 oparg,
                 operand,
                 (int)(stack_pointer - _PyFrame_Stackbase(frame)));
         next_uop++;
         OPT_STAT_INC(uops_executed);
-        UOP_STAT_INC(opcode, execution_count);
+        UOP_STAT_INC(uopcode, execution_count);
 #ifdef Py_STATS
         trace_uop_execution_counter++;
 #endif
 
-        switch (opcode) {
+        switch (uopcode) {
 
 #include "executor_cases.c.h"
 
@@ -1042,7 +1041,7 @@ _PyEval_EvalFrameDefault(PyThreadState *tstate, _PyInterpreterFrame *frame, int
 pop_1_error_tier_two:
     STACK_SHRINK(1);
 error_tier_two:
-    DPRINTF(2, "Error: [Opcode %d, operand %" PRIu64 "]\n", opcode, operand);
+    DPRINTF(2, "Error: [Opcode %d, operand %" PRIu64 "]\n", uopcode, operand);
     OPT_HIST(trace_uop_execution_counter, trace_run_length_hist);
     frame->return_offset = 0;  // Don't leave this random
     _PyFrame_SetStackPointer(frame, stack_pointer);
@@ -1053,9 +1052,9 @@ _PyEval_EvalFrameDefault(PyThreadState *tstate, _PyInterpreterFrame *frame, int
 deoptimize:
     // On DEOPT_IF we just repeat the last instruction.
     // This presumes nothing was popped from the stack (nor pushed).
-    DPRINTF(2, "DEOPT: [Opcode %d, operand %" PRIu64 " @ %d]\n", opcode, operand, (int)(next_uop-current_executor->trace-1));
+    DPRINTF(2, "DEOPT: [Opcode %d, operand %" PRIu64 " @ %d]\n", uopcode, operand, (int)(next_uop-current_executor->trace-1));
     OPT_HIST(trace_uop_execution_counter, trace_run_length_hist);
-    UOP_STAT_INC(opcode, miss);
+    UOP_STAT_INC(uopcode, miss);
     frame->return_offset = 0;  // Dispatch to frame->instr_ptr
     _PyFrame_SetStackPointer(frame, stack_pointer);
     frame->instr_ptr = next_uop[-1].target + _PyCode_CODE((PyCodeObject *)frame->f_executable);