From 548fb9452b04c4a4bcd9adf9b218e4bdb476c581 Mon Sep 17 00:00:00 2001 From: Peter Lazorchak Date: Wed, 20 Mar 2024 22:10:57 -0700 Subject: [PATCH 01/12] gh-116168: Remove extra _CHECK_STACK_SPACE uops --- Include/internal/pycore_uop_ids.h | 209 ++++++++-------- Include/internal/pycore_uop_metadata.h | 2 + Lib/test/test_capi/test_opt.py | 333 ++++++++++++++++++++++++- Modules/_testinternalcapi.c | 11 + Python/bytecodes.c | 6 + Python/executor_cases.c.h | 8 + Python/optimizer.c | 44 +++- Python/optimizer_analysis.c | 83 ++++++ Python/optimizer_cases.c.h | 4 + 9 files changed, 585 insertions(+), 115 deletions(-) diff --git a/Include/internal/pycore_uop_ids.h b/Include/internal/pycore_uop_ids.h index b569b80c5f110a..200d1f0a0f4e36 100644 --- a/Include/internal/pycore_uop_ids.h +++ b/Include/internal/pycore_uop_ids.h @@ -68,14 +68,15 @@ extern "C" { #define _CHECK_PEP_523 330 #define _CHECK_PERIODIC 331 #define _CHECK_STACK_SPACE 332 -#define _CHECK_VALIDITY 333 -#define _CHECK_VALIDITY_AND_SET_IP 334 -#define _COLD_EXIT 335 -#define _COMPARE_OP 336 -#define _COMPARE_OP_FLOAT 337 -#define _COMPARE_OP_INT 338 -#define _COMPARE_OP_STR 339 -#define _CONTAINS_OP 340 +#define _CHECK_STACK_SPACE_OPERAND 333 +#define _CHECK_VALIDITY 334 +#define _CHECK_VALIDITY_AND_SET_IP 335 +#define _COLD_EXIT 336 +#define _COMPARE_OP 337 +#define _COMPARE_OP_FLOAT 338 +#define _COMPARE_OP_INT 339 +#define _COMPARE_OP_STR 340 +#define _CONTAINS_OP 341 #define _CONTAINS_OP_DICT CONTAINS_OP_DICT #define _CONTAINS_OP_SET CONTAINS_OP_SET #define _CONVERT_VALUE CONVERT_VALUE @@ -91,41 +92,41 @@ extern "C" { #define _DICT_UPDATE DICT_UPDATE #define _END_SEND END_SEND #define _EXIT_INIT_CHECK EXIT_INIT_CHECK -#define _FATAL_ERROR 341 +#define _FATAL_ERROR 342 #define _FORMAT_SIMPLE FORMAT_SIMPLE #define _FORMAT_WITH_SPEC FORMAT_WITH_SPEC -#define _FOR_ITER 342 +#define _FOR_ITER 343 #define _FOR_ITER_GEN FOR_ITER_GEN -#define _FOR_ITER_TIER_TWO 343 +#define _FOR_ITER_TIER_TWO 344 #define _GET_AITER GET_AITER #define _GET_ANEXT GET_ANEXT #define _GET_AWAITABLE GET_AWAITABLE #define _GET_ITER GET_ITER #define _GET_LEN GET_LEN #define _GET_YIELD_FROM_ITER GET_YIELD_FROM_ITER -#define _GUARD_BOTH_FLOAT 344 -#define _GUARD_BOTH_INT 345 -#define _GUARD_BOTH_UNICODE 346 -#define _GUARD_BUILTINS_VERSION 347 -#define _GUARD_DORV_VALUES 348 -#define _GUARD_DORV_VALUES_INST_ATTR_FROM_DICT 349 -#define _GUARD_GLOBALS_VERSION 350 -#define _GUARD_IS_FALSE_POP 351 -#define _GUARD_IS_NONE_POP 352 -#define _GUARD_IS_NOT_NONE_POP 353 -#define _GUARD_IS_TRUE_POP 354 -#define _GUARD_KEYS_VERSION 355 -#define _GUARD_NOT_EXHAUSTED_LIST 356 -#define _GUARD_NOT_EXHAUSTED_RANGE 357 -#define _GUARD_NOT_EXHAUSTED_TUPLE 358 -#define _GUARD_TYPE_VERSION 359 -#define _INIT_CALL_BOUND_METHOD_EXACT_ARGS 360 -#define _INIT_CALL_PY_EXACT_ARGS 361 -#define _INIT_CALL_PY_EXACT_ARGS_0 362 -#define _INIT_CALL_PY_EXACT_ARGS_1 363 -#define _INIT_CALL_PY_EXACT_ARGS_2 364 -#define _INIT_CALL_PY_EXACT_ARGS_3 365 -#define _INIT_CALL_PY_EXACT_ARGS_4 366 +#define _GUARD_BOTH_FLOAT 345 +#define _GUARD_BOTH_INT 346 +#define _GUARD_BOTH_UNICODE 347 +#define _GUARD_BUILTINS_VERSION 348 +#define _GUARD_DORV_VALUES 349 +#define _GUARD_DORV_VALUES_INST_ATTR_FROM_DICT 350 +#define _GUARD_GLOBALS_VERSION 351 +#define _GUARD_IS_FALSE_POP 352 +#define _GUARD_IS_NONE_POP 353 +#define _GUARD_IS_NOT_NONE_POP 354 +#define _GUARD_IS_TRUE_POP 355 +#define _GUARD_KEYS_VERSION 356 +#define _GUARD_NOT_EXHAUSTED_LIST 357 +#define _GUARD_NOT_EXHAUSTED_RANGE 358 +#define _GUARD_NOT_EXHAUSTED_TUPLE 359 +#define _GUARD_TYPE_VERSION 360 +#define _INIT_CALL_BOUND_METHOD_EXACT_ARGS 361 +#define _INIT_CALL_PY_EXACT_ARGS 362 +#define _INIT_CALL_PY_EXACT_ARGS_0 363 +#define _INIT_CALL_PY_EXACT_ARGS_1 364 +#define _INIT_CALL_PY_EXACT_ARGS_2 365 +#define _INIT_CALL_PY_EXACT_ARGS_3 366 +#define _INIT_CALL_PY_EXACT_ARGS_4 367 #define _INSTRUMENTED_CALL INSTRUMENTED_CALL #define _INSTRUMENTED_CALL_FUNCTION_EX INSTRUMENTED_CALL_FUNCTION_EX #define _INSTRUMENTED_CALL_KW INSTRUMENTED_CALL_KW @@ -142,65 +143,65 @@ extern "C" { #define _INSTRUMENTED_RETURN_CONST INSTRUMENTED_RETURN_CONST #define _INSTRUMENTED_RETURN_VALUE INSTRUMENTED_RETURN_VALUE #define _INSTRUMENTED_YIELD_VALUE INSTRUMENTED_YIELD_VALUE -#define _INTERNAL_INCREMENT_OPT_COUNTER 367 -#define _IS_NONE 368 +#define _INTERNAL_INCREMENT_OPT_COUNTER 368 +#define _IS_NONE 369 #define _IS_OP IS_OP -#define _ITER_CHECK_LIST 369 -#define _ITER_CHECK_RANGE 370 -#define _ITER_CHECK_TUPLE 371 -#define _ITER_JUMP_LIST 372 -#define _ITER_JUMP_RANGE 373 -#define _ITER_JUMP_TUPLE 374 -#define _ITER_NEXT_LIST 375 -#define _ITER_NEXT_RANGE 376 -#define _ITER_NEXT_TUPLE 377 -#define _JUMP_TO_TOP 378 +#define _ITER_CHECK_LIST 370 +#define _ITER_CHECK_RANGE 371 +#define _ITER_CHECK_TUPLE 372 +#define _ITER_JUMP_LIST 373 +#define _ITER_JUMP_RANGE 374 +#define _ITER_JUMP_TUPLE 375 +#define _ITER_NEXT_LIST 376 +#define _ITER_NEXT_RANGE 377 +#define _ITER_NEXT_TUPLE 378 +#define _JUMP_TO_TOP 379 #define _LIST_APPEND LIST_APPEND #define _LIST_EXTEND LIST_EXTEND #define _LOAD_ASSERTION_ERROR LOAD_ASSERTION_ERROR -#define _LOAD_ATTR 379 -#define _LOAD_ATTR_CLASS 380 -#define _LOAD_ATTR_CLASS_0 381 -#define _LOAD_ATTR_CLASS_1 382 +#define _LOAD_ATTR 380 +#define _LOAD_ATTR_CLASS 381 +#define _LOAD_ATTR_CLASS_0 382 +#define _LOAD_ATTR_CLASS_1 383 #define _LOAD_ATTR_GETATTRIBUTE_OVERRIDDEN LOAD_ATTR_GETATTRIBUTE_OVERRIDDEN -#define _LOAD_ATTR_INSTANCE_VALUE 383 -#define _LOAD_ATTR_INSTANCE_VALUE_0 384 -#define _LOAD_ATTR_INSTANCE_VALUE_1 385 -#define _LOAD_ATTR_METHOD_LAZY_DICT 386 -#define _LOAD_ATTR_METHOD_NO_DICT 387 -#define _LOAD_ATTR_METHOD_WITH_VALUES 388 -#define _LOAD_ATTR_MODULE 389 -#define _LOAD_ATTR_NONDESCRIPTOR_NO_DICT 390 -#define _LOAD_ATTR_NONDESCRIPTOR_WITH_VALUES 391 +#define _LOAD_ATTR_INSTANCE_VALUE 384 +#define _LOAD_ATTR_INSTANCE_VALUE_0 385 +#define _LOAD_ATTR_INSTANCE_VALUE_1 386 +#define _LOAD_ATTR_METHOD_LAZY_DICT 387 +#define _LOAD_ATTR_METHOD_NO_DICT 388 +#define _LOAD_ATTR_METHOD_WITH_VALUES 389 +#define _LOAD_ATTR_MODULE 390 +#define _LOAD_ATTR_NONDESCRIPTOR_NO_DICT 391 +#define _LOAD_ATTR_NONDESCRIPTOR_WITH_VALUES 392 #define _LOAD_ATTR_PROPERTY LOAD_ATTR_PROPERTY -#define _LOAD_ATTR_SLOT 392 -#define _LOAD_ATTR_SLOT_0 393 -#define _LOAD_ATTR_SLOT_1 394 -#define _LOAD_ATTR_WITH_HINT 395 +#define _LOAD_ATTR_SLOT 393 +#define _LOAD_ATTR_SLOT_0 394 +#define _LOAD_ATTR_SLOT_1 395 +#define _LOAD_ATTR_WITH_HINT 396 #define _LOAD_BUILD_CLASS LOAD_BUILD_CLASS #define _LOAD_CONST LOAD_CONST -#define _LOAD_CONST_INLINE 396 -#define _LOAD_CONST_INLINE_BORROW 397 -#define _LOAD_CONST_INLINE_BORROW_WITH_NULL 398 -#define _LOAD_CONST_INLINE_WITH_NULL 399 +#define _LOAD_CONST_INLINE 397 +#define _LOAD_CONST_INLINE_BORROW 398 +#define _LOAD_CONST_INLINE_BORROW_WITH_NULL 399 +#define _LOAD_CONST_INLINE_WITH_NULL 400 #define _LOAD_DEREF LOAD_DEREF -#define _LOAD_FAST 400 -#define _LOAD_FAST_0 401 -#define _LOAD_FAST_1 402 -#define _LOAD_FAST_2 403 -#define _LOAD_FAST_3 404 -#define _LOAD_FAST_4 405 -#define _LOAD_FAST_5 406 -#define _LOAD_FAST_6 407 -#define _LOAD_FAST_7 408 +#define _LOAD_FAST 401 +#define _LOAD_FAST_0 402 +#define _LOAD_FAST_1 403 +#define _LOAD_FAST_2 404 +#define _LOAD_FAST_3 405 +#define _LOAD_FAST_4 406 +#define _LOAD_FAST_5 407 +#define _LOAD_FAST_6 408 +#define _LOAD_FAST_7 409 #define _LOAD_FAST_AND_CLEAR LOAD_FAST_AND_CLEAR #define _LOAD_FAST_CHECK LOAD_FAST_CHECK #define _LOAD_FAST_LOAD_FAST LOAD_FAST_LOAD_FAST #define _LOAD_FROM_DICT_OR_DEREF LOAD_FROM_DICT_OR_DEREF #define _LOAD_FROM_DICT_OR_GLOBALS LOAD_FROM_DICT_OR_GLOBALS -#define _LOAD_GLOBAL 409 -#define _LOAD_GLOBAL_BUILTINS 410 -#define _LOAD_GLOBAL_MODULE 411 +#define _LOAD_GLOBAL 410 +#define _LOAD_GLOBAL_BUILTINS 411 +#define _LOAD_GLOBAL_MODULE 412 #define _LOAD_LOCALS LOAD_LOCALS #define _LOAD_NAME LOAD_NAME #define _LOAD_SUPER_ATTR_ATTR LOAD_SUPER_ATTR_ATTR @@ -214,48 +215,48 @@ extern "C" { #define _MATCH_SEQUENCE MATCH_SEQUENCE #define _NOP NOP #define _POP_EXCEPT POP_EXCEPT -#define _POP_FRAME 412 -#define _POP_JUMP_IF_FALSE 413 -#define _POP_JUMP_IF_TRUE 414 +#define _POP_FRAME 413 +#define _POP_JUMP_IF_FALSE 414 +#define _POP_JUMP_IF_TRUE 415 #define _POP_TOP POP_TOP -#define _POP_TOP_LOAD_CONST_INLINE_BORROW 415 +#define _POP_TOP_LOAD_CONST_INLINE_BORROW 416 #define _PUSH_EXC_INFO PUSH_EXC_INFO -#define _PUSH_FRAME 416 +#define _PUSH_FRAME 417 #define _PUSH_NULL PUSH_NULL -#define _REPLACE_WITH_TRUE 417 +#define _REPLACE_WITH_TRUE 418 #define _RESUME_CHECK RESUME_CHECK -#define _SAVE_RETURN_OFFSET 418 -#define _SEND 419 +#define _SAVE_RETURN_OFFSET 419 +#define _SEND 420 #define _SEND_GEN SEND_GEN #define _SETUP_ANNOTATIONS SETUP_ANNOTATIONS #define _SET_ADD SET_ADD #define _SET_FUNCTION_ATTRIBUTE SET_FUNCTION_ATTRIBUTE #define _SET_UPDATE SET_UPDATE -#define _START_EXECUTOR 420 -#define _STORE_ATTR 421 -#define _STORE_ATTR_INSTANCE_VALUE 422 -#define _STORE_ATTR_SLOT 423 +#define _START_EXECUTOR 421 +#define _STORE_ATTR 422 +#define _STORE_ATTR_INSTANCE_VALUE 423 +#define _STORE_ATTR_SLOT 424 #define _STORE_ATTR_WITH_HINT STORE_ATTR_WITH_HINT #define _STORE_DEREF STORE_DEREF -#define _STORE_FAST 424 -#define _STORE_FAST_0 425 -#define _STORE_FAST_1 426 -#define _STORE_FAST_2 427 -#define _STORE_FAST_3 428 -#define _STORE_FAST_4 429 -#define _STORE_FAST_5 430 -#define _STORE_FAST_6 431 -#define _STORE_FAST_7 432 +#define _STORE_FAST 425 +#define _STORE_FAST_0 426 +#define _STORE_FAST_1 427 +#define _STORE_FAST_2 428 +#define _STORE_FAST_3 429 +#define _STORE_FAST_4 430 +#define _STORE_FAST_5 431 +#define _STORE_FAST_6 432 +#define _STORE_FAST_7 433 #define _STORE_FAST_LOAD_FAST STORE_FAST_LOAD_FAST #define _STORE_FAST_STORE_FAST STORE_FAST_STORE_FAST #define _STORE_GLOBAL STORE_GLOBAL #define _STORE_NAME STORE_NAME #define _STORE_SLICE STORE_SLICE -#define _STORE_SUBSCR 433 +#define _STORE_SUBSCR 434 #define _STORE_SUBSCR_DICT STORE_SUBSCR_DICT #define _STORE_SUBSCR_LIST_INT STORE_SUBSCR_LIST_INT #define _SWAP SWAP -#define _TO_BOOL 434 +#define _TO_BOOL 435 #define _TO_BOOL_BOOL TO_BOOL_BOOL #define _TO_BOOL_INT TO_BOOL_INT #define _TO_BOOL_LIST TO_BOOL_LIST @@ -265,12 +266,12 @@ extern "C" { #define _UNARY_NEGATIVE UNARY_NEGATIVE #define _UNARY_NOT UNARY_NOT #define _UNPACK_EX UNPACK_EX -#define _UNPACK_SEQUENCE 435 +#define _UNPACK_SEQUENCE 436 #define _UNPACK_SEQUENCE_LIST UNPACK_SEQUENCE_LIST #define _UNPACK_SEQUENCE_TUPLE UNPACK_SEQUENCE_TUPLE #define _UNPACK_SEQUENCE_TWO_TUPLE UNPACK_SEQUENCE_TWO_TUPLE #define _WITH_EXCEPT_START WITH_EXCEPT_START -#define MAX_UOP_ID 435 +#define MAX_UOP_ID 436 #ifdef __cplusplus } diff --git a/Include/internal/pycore_uop_metadata.h b/Include/internal/pycore_uop_metadata.h index 507bd27c01c553..9e560252ac65eb 100644 --- a/Include/internal/pycore_uop_metadata.h +++ b/Include/internal/pycore_uop_metadata.h @@ -230,6 +230,7 @@ const uint16_t _PyUop_Flags[MAX_UOP_ID+1] = { [_GUARD_IS_NOT_NONE_POP] = HAS_DEOPT_FLAG | HAS_EXIT_FLAG, [_JUMP_TO_TOP] = HAS_EVAL_BREAK_FLAG, [_SET_IP] = 0, + [_CHECK_STACK_SPACE_OPERAND] = HAS_DEOPT_FLAG, [_SAVE_RETURN_OFFSET] = HAS_ARG_FLAG, [_EXIT_TRACE] = HAS_DEOPT_FLAG | HAS_EXIT_FLAG, [_CHECK_VALIDITY] = HAS_DEOPT_FLAG, @@ -304,6 +305,7 @@ const char *const _PyOpcode_uop_name[MAX_UOP_ID+1] = { [_CHECK_PEP_523] = "_CHECK_PEP_523", [_CHECK_PERIODIC] = "_CHECK_PERIODIC", [_CHECK_STACK_SPACE] = "_CHECK_STACK_SPACE", + [_CHECK_STACK_SPACE_OPERAND] = "_CHECK_STACK_SPACE_OPERAND", [_CHECK_VALIDITY] = "_CHECK_VALIDITY", [_CHECK_VALIDITY_AND_SET_IP] = "_CHECK_VALIDITY_AND_SET_IP", [_COLD_EXIT] = "_COLD_EXIT", diff --git a/Lib/test/test_capi/test_opt.py b/Lib/test/test_capi/test_opt.py index b59f4b74a8593e..d7cc7ee407a01c 100644 --- a/Lib/test/test_capi/test_opt.py +++ b/Lib/test/test_capi/test_opt.py @@ -4,6 +4,7 @@ import unittest import gc import os +import platform import _opcode import _testinternalcapi @@ -578,6 +579,13 @@ def testfunc(n): @requires_specialization @unittest.skipIf(os.getenv("PYTHON_UOPS_OPTIMIZE") == "0", "Needs uop optimizer to run.") class TestUopsOptimization(unittest.TestCase): + FRAMESIZE_ADJUSTMENT = 1 if platform.architecture()[0] == "32bit" else 0 + + def _assert_framesize_cross_platform(self, fn_obj, expected_framesize): + self.assertEqual( + _testinternalcapi.get_co_framesize(fn_obj.__code__), + expected_framesize + self.FRAMESIZE_ADJUSTMENT + ) def _run_with_optimizer(self, testfunc, arg): res = None @@ -952,6 +960,299 @@ def testfunc(n): _, ex = self._run_with_optimizer(testfunc, 16) self.assertIsNone(ex) + def test_combine_stack_space_checks_sequential(self): + def dummy12(x): + return x - 1 + def dummy13(y): + z = y + 2 + return y, z + def testfunc(n): + a = 0 + for _ in range(n): + b = dummy12(7) + c, d = dummy13(9) + a += b + c + d + return a + + self._assert_framesize_cross_platform((dummy12), 12) + self._assert_framesize_cross_platform((dummy13), 13) + + res, ex = self._run_with_optimizer(testfunc, 32) + self.assertEqual(res, 832) + self.assertIsNotNone(ex) + + uops_and_operands = [(opcode, operand) for opcode, _, _, operand in list(ex)] + uop_names = [uop[0] for uop in uops_and_operands] + self.assertEqual(uop_names.count("_PUSH_FRAME"), 2) + self.assertEqual(uop_names.count("_POP_FRAME"), 2) + self.assertEqual(uop_names.count("_CHECK_STACK_SPACE"), 0) + self.assertEqual(uop_names.count("_CHECK_STACK_SPACE_OPERAND"), 1) + # sequential calls: max(12, 13) == 13 + largest_stack = _testinternalcapi.get_co_framesize(dummy13.__code__) + self.assertIn(("_CHECK_STACK_SPACE_OPERAND", largest_stack), uops_and_operands) + + def test_combine_stack_space_checks_nested(self): + def dummy12(x): + return x + 3 + def dummy15(y): + z = dummy12(y) + return y, z + def testfunc(n): + a = 0 + for _ in range(n): + b, c = dummy15(2) + a += b + c + return a + + self._assert_framesize_cross_platform((dummy12), 12) + self._assert_framesize_cross_platform((dummy15), 15) + + res, ex = self._run_with_optimizer(testfunc, 32) + self.assertEqual(res, 224) + self.assertIsNotNone(ex) + + uops_and_operands = [(opcode, operand) for opcode, _, _, operand in list(ex)] + uop_names = [uop[0] for uop in uops_and_operands] + self.assertEqual(uop_names.count("_PUSH_FRAME"), 2) + self.assertEqual(uop_names.count("_POP_FRAME"), 2) + self.assertEqual(uop_names.count("_CHECK_STACK_SPACE"), 0) + self.assertEqual(uop_names.count("_CHECK_STACK_SPACE_OPERAND"), 1) + # nested calls: 15 + 12 == 27 + largest_stack = ( + _testinternalcapi.get_co_framesize(dummy15.__code__) + + _testinternalcapi.get_co_framesize(dummy12.__code__) + ) + self.assertIn(("_CHECK_STACK_SPACE_OPERAND", largest_stack), uops_and_operands) + + def test_combine_stack_space_checks_several_calls(self): + def dummy12(x): + return x + 3 + def dummy13(y): + z = y + 2 + return y, z + def dummy18(y): + z = dummy12(y) + x, w = dummy13(z) + return z, x, w + def testfunc(n): + a = 0 + for _ in range(n): + b = dummy12(5) + c, d, e = dummy18(2) + a += b + c + d + e + return a + + self._assert_framesize_cross_platform((dummy12), 12) + self._assert_framesize_cross_platform((dummy13), 13) + self._assert_framesize_cross_platform((dummy18), 18) + + res, ex = self._run_with_optimizer(testfunc, 32) + self.assertEqual(res, 800) + self.assertIsNotNone(ex) + + uops_and_operands = [(opcode, operand) for opcode, _, _, operand in list(ex)] + uop_names = [uop[0] for uop in uops_and_operands] + self.assertEqual(uop_names.count("_PUSH_FRAME"), 4) + self.assertEqual(uop_names.count("_POP_FRAME"), 4) + self.assertEqual(uop_names.count("_CHECK_STACK_SPACE"), 0) + self.assertEqual(uop_names.count("_CHECK_STACK_SPACE_OPERAND"), 1) + # max(12, 18 + max(12, 13)) == 31 + largest_stack = ( + _testinternalcapi.get_co_framesize(dummy18.__code__) + + _testinternalcapi.get_co_framesize(dummy13.__code__) + ) + self.assertIn(("_CHECK_STACK_SPACE_OPERAND", largest_stack), uops_and_operands) + + def test_combine_stack_space_checks_several_calls_different_order(self): + # same as `several_calls` but with top-level calls reversed + def dummy12(x): + return x + 3 + def dummy13(y): + z = y + 2 + return y, z + def dummy18(y): + z = dummy12(y) + x, w = dummy13(z) + return z, x, w + def testfunc(n): + a = 0 + for _ in range(n): + c, d, e = dummy18(2) + b = dummy12(5) + a += b + c + d + e + return a + + self._assert_framesize_cross_platform((dummy12), 12) + self._assert_framesize_cross_platform((dummy13), 13) + self._assert_framesize_cross_platform((dummy18), 18) + + res, ex = self._run_with_optimizer(testfunc, 32) + self.assertEqual(res, 800) + self.assertIsNotNone(ex) + + uops_and_operands = [(opcode, operand) for opcode, _, _, operand in list(ex)] + uop_names = [uop[0] for uop in uops_and_operands] + self.assertEqual(uop_names.count("_PUSH_FRAME"), 4) + self.assertEqual(uop_names.count("_POP_FRAME"), 4) + self.assertEqual(uop_names.count("_CHECK_STACK_SPACE"), 0) + self.assertEqual(uop_names.count("_CHECK_STACK_SPACE_OPERAND"), 1) + # max(18 + max(12, 13), 12) == 31 + largest_stack = ( + _testinternalcapi.get_co_framesize(dummy18.__code__) + + _testinternalcapi.get_co_framesize(dummy13.__code__) + ) + self.assertIn(("_CHECK_STACK_SPACE_OPERAND", largest_stack), uops_and_operands) + + def test_combine_stack_space_complex(self): + def dummy0(x): + return x + def dummy1(x): + return dummy0(x) + def dummy2(x): + return dummy1(x) + def dummy3(x): + return dummy0(x) + def dummy4(x): + y = dummy0(x) + return dummy3(y) + def dummy5(x): + return dummy2(x) + def dummy6(x): + y = dummy5(x) + z = dummy0(y) + return dummy4(z) + def testfunc(n): + a = 0; + for _ in range(32): + b = dummy5(1) + c = dummy0(1) + d = dummy6(1) + a += b + c + d + return a + + self._assert_framesize_cross_platform(dummy0, 11) + self._assert_framesize_cross_platform(dummy1, 14) + self._assert_framesize_cross_platform(dummy2, 14) + self._assert_framesize_cross_platform(dummy3, 14) + self._assert_framesize_cross_platform(dummy4, 16) + self._assert_framesize_cross_platform(dummy5, 14) + self._assert_framesize_cross_platform(dummy6, 18) + + res, ex = self._run_with_optimizer(testfunc, 32) + self.assertEqual(res, 96) + self.assertIsNotNone(ex) + + uops_and_operands = [(opcode, operand) for opcode, _, _, operand in list(ex)] + uop_names = [uop[0] for uop in uops_and_operands] + self.assertEqual(uop_names.count("_PUSH_FRAME"), 15) + self.assertEqual(uop_names.count("_POP_FRAME"), 15) + + self.assertEqual(uop_names.count("_CHECK_STACK_SPACE"), 0) + self.assertEqual(uop_names.count("_CHECK_STACK_SPACE_OPERAND"), 1) + largest_stack = ( + _testinternalcapi.get_co_framesize(dummy6.__code__) + + _testinternalcapi.get_co_framesize(dummy5.__code__) + + _testinternalcapi.get_co_framesize(dummy2.__code__) + + _testinternalcapi.get_co_framesize(dummy1.__code__) + + _testinternalcapi.get_co_framesize(dummy0.__code__) + ) + self.assertIn( + ("_CHECK_STACK_SPACE_OPERAND", largest_stack), uops_and_operands + ) + + def test_combine_stack_space_checks_large_framesize(self): + # Create a function with a large framesize. This ensures _CHECK_STACK_SPACE is + # actually doing its job. Note that the resulting trace hits + # UOP_MAX_TRACE_LENGTH, but since all _CHECK_STACK_SPACEs happen early, this + # test is still meaningful. + repetitions = 10000 + ns = {} + header = """ + def dummy_large(a0): + """ + body = "".join([f""" + a{n+1} = a{n} + 1 + """ for n in range(repetitions)]) + return_ = f""" + return a{repetitions-1} + """ + exec(textwrap.dedent(header + body + return_), ns, ns) + dummy_large = ns['dummy_large'] + + # this is something like: + # + # def dummy_large(a0): + # a1 = a0 + 1 + # a2 = a1 + 1 + # .... + # a99999 = a99998 + 1 + # return a99999 + + def dummy15(z): + y = dummy_large(z) + return y + 3 + + def testfunc(n): + b = 0 + for _ in range(n): + b += dummy15(7) + return b + + self._assert_framesize_cross_platform((dummy_large), repetitions + 12) + self._assert_framesize_cross_platform((dummy15), 15) + + res, ex = self._run_with_optimizer(testfunc, 32) + self.assertEqual(res, 32 * (repetitions + 9)) + self.assertIsNotNone(ex) + + uops_and_operands = [(opcode, operand) for opcode, _, _, operand in list(ex)] + uop_names = [uop[0] for uop in uops_and_operands] + self.assertEqual(uop_names.count("_PUSH_FRAME"), 2) + self.assertEqual(uop_names.count("_CHECK_STACK_SPACE_OPERAND"), 1) + + # this hits a different case during trace projection in refcount test runs only, + # so we need to account for both possibilities + self.assertIn(uop_names.count("_CHECK_STACK_SPACE"), [0, 1]) + if uop_names.count("_CHECK_STACK_SPACE") == 0: + largest_stack = ( + _testinternalcapi.get_co_framesize(dummy15.__code__) + + _testinternalcapi.get_co_framesize(dummy_large.__code__) + ) + self.assertIn( + ("_CHECK_STACK_SPACE_OPERAND", largest_stack), uops_and_operands + ) + else: + largest_stack = _testinternalcapi.get_co_framesize(dummy15.__code__) + self.assertIn( + ("_CHECK_STACK_SPACE_OPERAND", largest_stack), uops_and_operands + ) + + def test_combine_stack_space_checks_recursion(self): + def dummy15(x): + while x > 0: + return dummy15(x - 1) + return 42 + def testfunc(n): + a = 0 + for _ in range(n): + a += dummy15(n) + return a + + self._assert_framesize_cross_platform((dummy15), 15) + + res, ex = self._run_with_optimizer(testfunc, 32) + self.assertEqual(res, 42 * 32) + self.assertIsNotNone(ex) + + uops_and_operands = [(opcode, operand) for opcode, _, _, operand in list(ex)] + uop_names = [uop[0] for uop in uops_and_operands] + self.assertEqual(uop_names.count("_PUSH_FRAME"), 2) + self.assertEqual(uop_names.count("_POP_FRAME"), 0) + self.assertEqual(uop_names.count("_CHECK_STACK_SPACE"), 0) + self.assertEqual(uop_names.count("_CHECK_STACK_SPACE_OPERAND"), 1) + largest_stack = _testinternalcapi.get_co_framesize(dummy15.__code__) * 2 + self.assertIn(("_CHECK_STACK_SPACE_OPERAND", largest_stack), uops_and_operands) + def test_many_nested(self): # overflow the trace_stack def dummy_a(x): @@ -970,14 +1271,42 @@ def dummy_g(x): return dummy_f(x) def dummy_h(x): return dummy_g(x) + def dummy_i(x): + return dummy_h(x) + def dummy_j(x): + return dummy_i(x) + def dummy_k(x): + return dummy_j(x) + def dummy_l(x): + return dummy_k(x) def testfunc(n): a = 0 for _ in range(n): - a += dummy_h(n) + a += dummy_l(n) return a - self._run_with_optimizer(testfunc, 32) + self._assert_framesize_cross_platform((dummy_l), 14) + res, ex = self._run_with_optimizer(testfunc, 32) + self.assertEqual(res, 32 * 32) + self.assertIsNotNone(ex) + + uops_and_operands = [(opcode, operand) for opcode, _, _, operand in list(ex)] + uop_names = [uop[0] for uop in uops_and_operands] + self.assertEqual(uop_names.count("_PUSH_FRAME"), 6) + # we run out of trace stack depth before emitting any _POP_FRAMEs + self.assertEqual(uop_names.count("_POP_FRAME"), 0) + self.assertEqual(uop_names.count("_CHECK_STACK_SPACE"), 0) + self.assertEqual(uop_names.count("_CHECK_STACK_SPACE_OPERAND"), 1) + largest_stack = ( + _testinternalcapi.get_co_framesize(dummy_l.__code__) + + _testinternalcapi.get_co_framesize(dummy_k.__code__) + + _testinternalcapi.get_co_framesize(dummy_j.__code__) + + _testinternalcapi.get_co_framesize(dummy_i.__code__) + + _testinternalcapi.get_co_framesize(dummy_h.__code__) + + _testinternalcapi.get_co_framesize(dummy_g.__code__) + ) + self.assertIn(("_CHECK_STACK_SPACE_OPERAND", largest_stack), uops_and_operands) if __name__ == "__main__": unittest.main() diff --git a/Modules/_testinternalcapi.c b/Modules/_testinternalcapi.c index c07652facc0ae2..147c9643f8c7d6 100644 --- a/Modules/_testinternalcapi.c +++ b/Modules/_testinternalcapi.c @@ -957,6 +957,16 @@ iframe_getlasti(PyObject *self, PyObject *frame) return PyLong_FromLong(PyUnstable_InterpreterFrame_GetLasti(f)); } +static PyObject * +get_co_framesize(PyObject *self, PyObject *arg) { + if (!PyCode_Check(arg)) { + PyErr_SetString(PyExc_TypeError, "argument must be a code object"); + return NULL; + } + PyCodeObject *code = (PyCodeObject *)arg; + return PyLong_FromLong(code->co_framesize); +} + static PyObject * new_counter_optimizer(PyObject *self, PyObject *arg) { @@ -1814,6 +1824,7 @@ static PyMethodDef module_functions[] = { {"iframe_getcode", iframe_getcode, METH_O, NULL}, {"iframe_getline", iframe_getline, METH_O, NULL}, {"iframe_getlasti", iframe_getlasti, METH_O, NULL}, + {"get_co_framesize", get_co_framesize, METH_O, NULL}, {"get_optimizer", get_optimizer, METH_NOARGS, NULL}, {"set_optimizer", set_optimizer, METH_O, NULL}, {"new_counter_optimizer", new_counter_optimizer, METH_NOARGS, NULL}, diff --git a/Python/bytecodes.c b/Python/bytecodes.c index 476975d2fbc3c2..ddc55a1930424c 100644 --- a/Python/bytecodes.c +++ b/Python/bytecodes.c @@ -4094,6 +4094,12 @@ dummy_func( frame->instr_ptr = (_Py_CODEUNIT *)instr_ptr; } + tier2 op(_CHECK_STACK_SPACE_OPERAND, (framesize/4 --)) { + assert((uint64_t)framesize < INT_MAX); + DEOPT_IF(!_PyThreadState_HasStackSpace(tstate, (uint64_t)framesize)); + DEOPT_IF(tstate->py_recursion_remaining <= 1); + } + op(_SAVE_RETURN_OFFSET, (--)) { #if TIER_ONE frame->return_offset = (uint16_t)(next_instr - this_instr); diff --git a/Python/executor_cases.c.h b/Python/executor_cases.c.h index a55daa2c344944..a48ea6b8f61a88 100644 --- a/Python/executor_cases.c.h +++ b/Python/executor_cases.c.h @@ -3726,6 +3726,14 @@ break; } + case _CHECK_STACK_SPACE_OPERAND: { + PyObject *framesize = (PyObject *)CURRENT_OPERAND(); + assert((uint64_t)framesize < INT_MAX); + if (!_PyThreadState_HasStackSpace(tstate, (uint64_t)framesize)) goto deoptimize; + if (tstate->py_recursion_remaining <= 1) goto deoptimize; + break; + } + case _SAVE_RETURN_OFFSET: { oparg = CURRENT_OPARG(); #if TIER_ONE diff --git a/Python/optimizer.c b/Python/optimizer.c index f8c1390a061650..f75941504d13b3 100644 --- a/Python/optimizer.c +++ b/Python/optimizer.c @@ -399,6 +399,30 @@ PyTypeObject _PyUOpExecutor_Type = { .tp_clear = executor_clear, }; +/* + * Called during trace projection just after emitting _PUSH_FRAME. + * Convert the corresponding _CHECK_STACK_SPACE to _CHECK_STACK_SPACE_OPERAND + * and set its operand to the amount of space required. + */ +static void +update_last_check_stack_space( + _PyUOpInstruction *trace, + int trace_length, + PyCodeObject *new_code) +{ + if (new_code == NULL) { + return; + } + assert(trace[trace_length - 1].opcode == _PUSH_FRAME); + // -1 to _PUSH_FRAME; -3 from _PUSH_FRAME to _CHECK_STACK_SPACE + int last_check_stack_idx = trace_length - 4; + assert(last_check_stack_idx >= 0); + assert(trace[last_check_stack_idx].opcode == _CHECK_STACK_SPACE); + assert(new_code != NULL); + trace[last_check_stack_idx].opcode = _CHECK_STACK_SPACE_OPERAND; + trace[last_check_stack_idx].operand = new_code->co_framesize; +} + /* TO DO -- Generate these tables */ static const uint16_t _PyUOp_Replacements[MAX_UOP_ID + 1] = { @@ -456,6 +480,11 @@ BRANCH_TO_GUARD[4][2] = { trace_length++; #endif +#define ADD_PUSH_FRAME_TO_TRACE(OPCODE, OPARG, OPERAND, TARGET) \ + assert((OPCODE) == _PUSH_FRAME); \ + ADD_TO_TRACE((OPCODE), (OPARG), (OPERAND), (TARGET)); \ + update_last_check_stack_space(trace, trace_length, new_code); \ + #define INSTR_IP(INSTR, CODE) \ ((uint32_t)((INSTR) - ((_Py_CODEUNIT *)(CODE)->co_code_adaptive))) @@ -476,7 +505,7 @@ BRANCH_TO_GUARD[4][2] = { if (trace_stack_depth >= TRACE_STACK_SIZE) { \ DPRINTF(2, "Trace stack overflow\n"); \ OPT_STAT_INC(trace_stack_overflow); \ - ADD_TO_TRACE(uop, oparg, operand, target); \ + ADD_PUSH_FRAME_TO_TRACE(uop, oparg, operand, target); \ ADD_TO_TRACE(_EXIT_TRACE, 0, 0, 0); \ goto done; \ } \ @@ -765,7 +794,7 @@ translate_bytecode_to_trace( PyUnicode_AsUTF8(new_code->co_filename), new_code->co_firstlineno); OPT_STAT_INC(recursive_call); - ADD_TO_TRACE(uop, oparg, 0, target); + ADD_PUSH_FRAME_TO_TRACE(uop, oparg, 0, target) ADD_TO_TRACE(_EXIT_TRACE, 0, 0, 0); goto done; } @@ -774,7 +803,7 @@ translate_bytecode_to_trace( // Perhaps it may happen again, so don't bother tracing. // TODO: Reason about this -- is it better to bail or not? DPRINTF(2, "Bailing because co_version != func_version\n"); - ADD_TO_TRACE(uop, oparg, 0, target); + ADD_PUSH_FRAME_TO_TRACE(uop, oparg, 0, target); ADD_TO_TRACE(_EXIT_TRACE, 0, 0, 0); goto done; } @@ -791,13 +820,10 @@ translate_bytecode_to_trace( if (new_func != NULL) { operand = (uintptr_t)new_func; } - else if (new_code != NULL) { - operand = (uintptr_t)new_code | 1; - } else { - operand = 0; + operand = (uintptr_t)new_code | 1; } - ADD_TO_TRACE(uop, oparg, operand, target); + ADD_PUSH_FRAME_TO_TRACE(uop, oparg, operand, target); code = new_code; func = new_func; instr = _PyCode_CODE(code); @@ -810,7 +836,7 @@ translate_bytecode_to_trace( goto top; } DPRINTF(2, "Bail, new_code == NULL\n"); - ADD_TO_TRACE(uop, oparg, 0, target); + ADD_PUSH_FRAME_TO_TRACE(uop, oparg, 0, target); ADD_TO_TRACE(_EXIT_TRACE, 0, 0, 0); goto done; } diff --git a/Python/optimizer_analysis.c b/Python/optimizer_analysis.c index 95924a57cfdaf4..ecc1fdeabee125 100644 --- a/Python/optimizer_analysis.c +++ b/Python/optimizer_analysis.c @@ -580,6 +580,88 @@ peephole_opt(_PyInterpreterFrame *frame, _PyUOpInstruction *buffer, int buffer_s } } +static void +combine_stack_space_checks( + _PyUOpInstruction *buffer, + int buffer_size +) +{ + /* Compute the maximum stack space needed for this trace and consolidate + * all _CHECK_STACK_SPACE_OPERAND ops to just one. + * This calculation is essentially a traversal over the tree of function + * calls in this trace. + */ +#ifdef Py_DEBUG + // assume there's a _PUSH_STACK after every _CHECK_STACK_SPACE_OPERAND + bool expecting_push = false; + /* We might see _CHECK_STACK_SPACE, but only if no code object was + * available for _PUSH_FRAME at trace projection time. If this occurs, + * it should be closely followed by _PUSH_FRAME + _EXIT_TRACE and must + * not be followed by any _CHECK_STACK_SPACE_OPERANDs. */ + bool saw_check_stack_space = false; +#endif + int depth = 0; + uint64_t space_at_depth[MAX_ABSTRACT_FRAME_DEPTH]; + uint64_t curr_space = 0; + uint64_t max_space = 0; + _PyUOpInstruction *first_check_stack = NULL; + for (int pc = 0; pc < buffer_size; pc++) { + int opcode = buffer[pc].opcode; + switch (opcode) { + case _CHECK_STACK_SPACE_OPERAND: { + assert(expecting_push == false); + assert(saw_check_stack_space == false); + if (first_check_stack == NULL) { + first_check_stack = &buffer[pc]; + } + else { + // delete all but the first _CHECK_STACK_SPACE_OPERAND + buffer[pc].opcode = _NOP; + } + space_at_depth[depth] = buffer[pc].operand; + curr_space += buffer[pc].operand; + max_space = curr_space > max_space ? curr_space : max_space; + depth++; + assert(depth < MAX_ABSTRACT_FRAME_DEPTH); +#ifdef Py_DEBUG + expecting_push = true; +#endif + break; + } + case _POP_FRAME: { + assert(expecting_push == false); + assert(saw_check_stack_space == false); + depth--; + assert(depth >= 0); + curr_space -= space_at_depth[depth]; + break; + } +#ifdef Py_DEBUG + case _CHECK_STACK_SPACE: { + assert(expecting_push == false); + expecting_push = true; + saw_check_stack_space = true; + break; + } + case _PUSH_FRAME: { + assert(expecting_push == true); + expecting_push = false; + break; + } +#endif + case _EXIT_TRACE: { + if (first_check_stack != NULL) { + assert(max_space < INT_MAX); + first_check_stack->operand = max_space; + } + return; + } + } + } + // must only return due to _EXIT_TRACE + assert(false); +} + // 0 - failure, no error raised, just fall back to Tier 1 // -1 - failure, and raise error // 1 - optimizer success @@ -614,6 +696,7 @@ _Py_uop_analyze_and_optimize( assert(err == 1); remove_unneeded_uops(buffer, buffer_size); + combine_stack_space_checks(buffer, buffer_size); OPT_STAT_INC(optimizer_successes); return 1; diff --git a/Python/optimizer_cases.c.h b/Python/optimizer_cases.c.h index 6aeea51e62584f..5979cff06979f8 100644 --- a/Python/optimizer_cases.c.h +++ b/Python/optimizer_cases.c.h @@ -1942,6 +1942,10 @@ break; } + case _CHECK_STACK_SPACE_OPERAND: { + break; + } + case _SAVE_RETURN_OFFSET: { break; } From 5eca7ca3fe7bab2c8e43b87ad104f48c87935555 Mon Sep 17 00:00:00 2001 From: Peter Lazorchak Date: Mon, 25 Mar 2024 18:50:24 -0700 Subject: [PATCH 02/12] Assorted cleanup and pipeline fix --- Lib/test/test_capi/test_opt.py | 3 --- Modules/_testinternalcapi.c | 2 +- Python/optimizer_analysis.c | 4 ++++ 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/Lib/test/test_capi/test_opt.py b/Lib/test/test_capi/test_opt.py index d7cc7ee407a01c..de01fd171390fd 100644 --- a/Lib/test/test_capi/test_opt.py +++ b/Lib/test/test_capi/test_opt.py @@ -1218,9 +1218,6 @@ def testfunc(n): _testinternalcapi.get_co_framesize(dummy15.__code__) + _testinternalcapi.get_co_framesize(dummy_large.__code__) ) - self.assertIn( - ("_CHECK_STACK_SPACE_OPERAND", largest_stack), uops_and_operands - ) else: largest_stack = _testinternalcapi.get_co_framesize(dummy15.__code__) self.assertIn( diff --git a/Modules/_testinternalcapi.c b/Modules/_testinternalcapi.c index 147c9643f8c7d6..72d1441a3d9d9a 100644 --- a/Modules/_testinternalcapi.c +++ b/Modules/_testinternalcapi.c @@ -960,7 +960,7 @@ iframe_getlasti(PyObject *self, PyObject *frame) static PyObject * get_co_framesize(PyObject *self, PyObject *arg) { if (!PyCode_Check(arg)) { - PyErr_SetString(PyExc_TypeError, "argument must be a code object"); + PyErr_SetString(PyExc_TypeError, "argument must be a code object"); return NULL; } PyCodeObject *code = (PyCodeObject *)arg; diff --git a/Python/optimizer_analysis.c b/Python/optimizer_analysis.c index ecc1fdeabee125..cf921ba633129a 100644 --- a/Python/optimizer_analysis.c +++ b/Python/optimizer_analysis.c @@ -609,8 +609,10 @@ combine_stack_space_checks( int opcode = buffer[pc].opcode; switch (opcode) { case _CHECK_STACK_SPACE_OPERAND: { +#ifdef Py_DEBUG assert(expecting_push == false); assert(saw_check_stack_space == false); +#endif if (first_check_stack == NULL) { first_check_stack = &buffer[pc]; } @@ -629,8 +631,10 @@ combine_stack_space_checks( break; } case _POP_FRAME: { +#ifdef Py_DEBUG assert(expecting_push == false); assert(saw_check_stack_space == false); +#endif depth--; assert(depth >= 0); curr_space -= space_at_depth[depth]; From 1b1d8e12b2d28e51dbc6819d4354d7ec8b9aa667 Mon Sep 17 00:00:00 2001 From: Peter Lazorchak Date: Tue, 26 Mar 2024 15:15:22 -0700 Subject: [PATCH 03/12] Fix loss of data warning and revert changes to optimizer.c --- Python/bytecodes.c | 6 +- Python/executor_cases.c.h | 6 +- Python/optimizer.c | 421 +++++++++++++++++++++++++----------- Python/optimizer_analysis.c | 16 +- 4 files changed, 308 insertions(+), 141 deletions(-) diff --git a/Python/bytecodes.c b/Python/bytecodes.c index ddc55a1930424c..d16c04824171c1 100644 --- a/Python/bytecodes.c +++ b/Python/bytecodes.c @@ -4094,9 +4094,9 @@ dummy_func( frame->instr_ptr = (_Py_CODEUNIT *)instr_ptr; } - tier2 op(_CHECK_STACK_SPACE_OPERAND, (framesize/4 --)) { - assert((uint64_t)framesize < INT_MAX); - DEOPT_IF(!_PyThreadState_HasStackSpace(tstate, (uint64_t)framesize)); + tier2 op(_CHECK_STACK_SPACE_OPERAND, (framesize/2 --)) { + assert(framesize < INT_MAX); + DEOPT_IF(!_PyThreadState_HasStackSpace(tstate, framesize)); DEOPT_IF(tstate->py_recursion_remaining <= 1); } diff --git a/Python/executor_cases.c.h b/Python/executor_cases.c.h index a48ea6b8f61a88..51f072989fb489 100644 --- a/Python/executor_cases.c.h +++ b/Python/executor_cases.c.h @@ -3727,9 +3727,9 @@ } case _CHECK_STACK_SPACE_OPERAND: { - PyObject *framesize = (PyObject *)CURRENT_OPERAND(); - assert((uint64_t)framesize < INT_MAX); - if (!_PyThreadState_HasStackSpace(tstate, (uint64_t)framesize)) goto deoptimize; + uint32_t framesize = (uint32_t)CURRENT_OPERAND(); + assert(framesize < INT_MAX); + if (!_PyThreadState_HasStackSpace(tstate, framesize)) goto deoptimize; if (tstate->py_recursion_remaining <= 1) goto deoptimize; break; } diff --git a/Python/optimizer.c b/Python/optimizer.c index f75941504d13b3..38ab6d3cf61c72 100644 --- a/Python/optimizer.c +++ b/Python/optimizer.c @@ -154,13 +154,19 @@ PyUnstable_GetOptimizer(void) } static _PyExecutorObject * -make_executor_from_uops(_PyUOpInstruction *buffer, const _PyBloomFilter *dependencies); +make_executor_from_uops(_PyUOpInstruction *buffer, int length, const _PyBloomFilter *dependencies); static int init_cold_exit_executor(_PyExecutorObject *executor, int oparg); +/* It is impossible for the number of exits to reach 1/4 of the total length, + * as the number of exits cannot reach 1/3 of the number of non-exits, due to + * the presence of CHECK_VALIDITY checks and instructions to produce the values + * being checked in exits. */ +#define COLD_EXIT_COUNT (UOP_MAX_TRACE_LENGTH/4) + static int cold_exits_initialized = 0; -static _PyExecutorObject COLD_EXITS[UOP_MAX_TRACE_LENGTH] = { 0 }; +static _PyExecutorObject COLD_EXITS[COLD_EXIT_COUNT] = { 0 }; static const _PyBloomFilter EMPTY_FILTER = { 0 }; @@ -172,7 +178,7 @@ _Py_SetOptimizer(PyInterpreterState *interp, _PyOptimizerObject *optimizer) } else if (cold_exits_initialized == 0) { cold_exits_initialized = 1; - for (int i = 0; i < UOP_MAX_TRACE_LENGTH; i++) { + for (int i = 0; i < COLD_EXIT_COUNT; i++) { if (init_cold_exit_executor(&COLD_EXITS[i], i)) { return NULL; } @@ -313,10 +319,33 @@ _PyUOpPrint(const _PyUOpInstruction *uop) else { printf("%s", name); } - printf(" (%d, target=%d, operand=%#" PRIx64 ")", - uop->oparg, - uop->target, - (uint64_t)uop->operand); + switch(uop->format) { + case UOP_FORMAT_TARGET: + printf(" (%d, target=%d, operand=%#" PRIx64, + uop->oparg, + uop->target, + (uint64_t)uop->operand); + break; + case UOP_FORMAT_JUMP: + printf(" (%d, jump_target=%d, operand=%#" PRIx64, + uop->oparg, + uop->jump_target, + (uint64_t)uop->operand); + break; + case UOP_FORMAT_EXIT: + printf(" (%d, exit_index=%d, operand=%#" PRIx64, + uop->oparg, + uop->exit_index, + (uint64_t)uop->operand); + break; + default: + printf(" (%d, Unknown format)", uop->oparg); + } + if (_PyUop_Flags[uop->opcode] & HAS_ERROR_FLAG) { + printf(", error_target=%d", uop->error_target); + } + + printf(")"); } #endif @@ -399,30 +428,6 @@ PyTypeObject _PyUOpExecutor_Type = { .tp_clear = executor_clear, }; -/* - * Called during trace projection just after emitting _PUSH_FRAME. - * Convert the corresponding _CHECK_STACK_SPACE to _CHECK_STACK_SPACE_OPERAND - * and set its operand to the amount of space required. - */ -static void -update_last_check_stack_space( - _PyUOpInstruction *trace, - int trace_length, - PyCodeObject *new_code) -{ - if (new_code == NULL) { - return; - } - assert(trace[trace_length - 1].opcode == _PUSH_FRAME); - // -1 to _PUSH_FRAME; -3 from _PUSH_FRAME to _CHECK_STACK_SPACE - int last_check_stack_idx = trace_length - 4; - assert(last_check_stack_idx >= 0); - assert(trace[last_check_stack_idx].opcode == _CHECK_STACK_SPACE); - assert(new_code != NULL); - trace[last_check_stack_idx].opcode = _CHECK_STACK_SPACE_OPERAND; - trace[last_check_stack_idx].operand = new_code->co_framesize; -} - /* TO DO -- Generate these tables */ static const uint16_t _PyUOp_Replacements[MAX_UOP_ID + 1] = { @@ -456,35 +461,38 @@ BRANCH_TO_GUARD[4][2] = { #endif -// Beware: Macro arg order differs from struct member order +static inline int +add_to_trace( + _PyUOpInstruction *trace, + int trace_length, + uint16_t opcode, + uint16_t oparg, + uint64_t operand, + uint32_t target) +{ + trace[trace_length].opcode = opcode; + trace[trace_length].format = UOP_FORMAT_TARGET; + trace[trace_length].target = target; + trace[trace_length].oparg = oparg; + trace[trace_length].operand = operand; + return trace_length + 1; +} + #ifdef Py_DEBUG #define ADD_TO_TRACE(OPCODE, OPARG, OPERAND, TARGET) \ assert(trace_length < max_length); \ - trace[trace_length].opcode = (OPCODE); \ - trace[trace_length].oparg = (OPARG); \ - trace[trace_length].target = (TARGET); \ - trace[trace_length].operand = (OPERAND); \ + trace_length = add_to_trace(trace, trace_length, (OPCODE), (OPARG), (OPERAND), (TARGET)); \ if (lltrace >= 2) { \ printf("%4d ADD_TO_TRACE: ", trace_length); \ - _PyUOpPrint(&trace[trace_length]); \ + _PyUOpPrint(&trace[trace_length-1]); \ printf("\n"); \ - } \ - trace_length++; + } #else #define ADD_TO_TRACE(OPCODE, OPARG, OPERAND, TARGET) \ assert(trace_length < max_length); \ - trace[trace_length].opcode = (OPCODE); \ - trace[trace_length].oparg = (OPARG); \ - trace[trace_length].target = (TARGET); \ - trace[trace_length].operand = (OPERAND); \ - trace_length++; + trace_length = add_to_trace(trace, trace_length, (OPCODE), (OPARG), (OPERAND), (TARGET)); #endif -#define ADD_PUSH_FRAME_TO_TRACE(OPCODE, OPARG, OPERAND, TARGET) \ - assert((OPCODE) == _PUSH_FRAME); \ - ADD_TO_TRACE((OPCODE), (OPARG), (OPERAND), (TARGET)); \ - update_last_check_stack_space(trace, trace_length, new_code); \ - #define INSTR_IP(INSTR, CODE) \ ((uint32_t)((INSTR) - ((_Py_CODEUNIT *)(CODE)->co_code_adaptive))) @@ -505,8 +513,7 @@ BRANCH_TO_GUARD[4][2] = { if (trace_stack_depth >= TRACE_STACK_SIZE) { \ DPRINTF(2, "Trace stack overflow\n"); \ OPT_STAT_INC(trace_stack_overflow); \ - ADD_PUSH_FRAME_TO_TRACE(uop, oparg, operand, target); \ - ADD_TO_TRACE(_EXIT_TRACE, 0, 0, 0); \ + trace_length = 0; \ goto done; \ } \ assert(func == NULL || func->func_code == (PyObject *)code); \ @@ -524,7 +531,7 @@ BRANCH_TO_GUARD[4][2] = { assert(func == NULL || func->func_code == (PyObject *)code); \ instr = trace_stack[trace_stack_depth].instr; -/* Returns 1 on success, +/* Returns the length of the trace on success, * 0 if it failed to produce a worthwhile trace, * and -1 on an error. */ @@ -544,7 +551,8 @@ translate_bytecode_to_trace( _Py_BloomFilter_Add(dependencies, initial_code); _Py_CODEUNIT *initial_instr = instr; int trace_length = 0; - int max_length = buffer_size; + // Leave space for possible trailing _EXIT_TRACE + int max_length = buffer_size-2; struct { PyFunctionObject *func; PyCodeObject *code; @@ -567,13 +575,16 @@ translate_bytecode_to_trace( PyUnicode_AsUTF8(code->co_filename), code->co_firstlineno, 2 * INSTR_IP(initial_instr, code)); + ADD_TO_TRACE(_START_EXECUTOR, 0, (uintptr_t)instr, INSTR_IP(instr, code)); uint32_t target = 0; top: // Jump here after _PUSH_FRAME or likely branches for (;;) { target = INSTR_IP(instr, code); - RESERVE_RAW(2, "epilogue"); // Always need space for _SET_IP, _CHECK_VALIDITY and _EXIT_TRACE + RESERVE_RAW(2, "_CHECK_VALIDITY_AND_SET_IP"); ADD_TO_TRACE(_CHECK_VALIDITY_AND_SET_IP, 0, (uintptr_t)instr, target); + // Need space for _DEOPT + max_length--; uint32_t opcode = instr->op.code; uint32_t oparg = instr->op.arg; @@ -611,13 +622,22 @@ translate_bytecode_to_trace( continue; } else { - if (OPCODE_HAS_DEOPT(opcode)) { + if (OPCODE_HAS_EXIT(opcode) || OPCODE_HAS_DEOPT(opcode)) { opcode = _PyOpcode_Deopt[opcode]; } + assert(!OPCODE_HAS_EXIT(opcode)); assert(!OPCODE_HAS_DEOPT(opcode)); } } + if (OPCODE_HAS_EXIT(opcode)) { + // Make space for exit code + max_length--; + } + if (OPCODE_HAS_ERROR(opcode)) { + // Make space for error code + max_length--; + } switch (opcode) { case POP_JUMP_IF_NONE: case POP_JUMP_IF_NOT_NONE: @@ -653,10 +673,10 @@ translate_bytecode_to_trace( DPRINTF(2, "Jump likely (%04x = %d bits), continue at byte offset %d\n", instr[1].cache, bitcount, 2 * INSTR_IP(target_instr, code)); instr = target_instr; - ADD_TO_TRACE(uopcode, max_length, 0, INSTR_IP(next_instr, code)); + ADD_TO_TRACE(uopcode, 0, 0, INSTR_IP(next_instr, code)); goto top; } - ADD_TO_TRACE(uopcode, max_length, 0, INSTR_IP(target_instr, code)); + ADD_TO_TRACE(uopcode, 0, 0, INSTR_IP(target_instr, code)); break; } @@ -794,7 +814,7 @@ translate_bytecode_to_trace( PyUnicode_AsUTF8(new_code->co_filename), new_code->co_firstlineno); OPT_STAT_INC(recursive_call); - ADD_PUSH_FRAME_TO_TRACE(uop, oparg, 0, target) + ADD_TO_TRACE(uop, oparg, 0, target); ADD_TO_TRACE(_EXIT_TRACE, 0, 0, 0); goto done; } @@ -803,7 +823,7 @@ translate_bytecode_to_trace( // Perhaps it may happen again, so don't bother tracing. // TODO: Reason about this -- is it better to bail or not? DPRINTF(2, "Bailing because co_version != func_version\n"); - ADD_PUSH_FRAME_TO_TRACE(uop, oparg, 0, target); + ADD_TO_TRACE(uop, oparg, 0, target); ADD_TO_TRACE(_EXIT_TRACE, 0, 0, 0); goto done; } @@ -820,10 +840,13 @@ translate_bytecode_to_trace( if (new_func != NULL) { operand = (uintptr_t)new_func; } - else { + else if (new_code != NULL) { operand = (uintptr_t)new_code | 1; } - ADD_PUSH_FRAME_TO_TRACE(uop, oparg, operand, target); + else { + operand = 0; + } + ADD_TO_TRACE(uop, oparg, operand, target); code = new_code; func = new_func; instr = _PyCode_CODE(code); @@ -836,7 +859,7 @@ translate_bytecode_to_trace( goto top; } DPRINTF(2, "Bail, new_code == NULL\n"); - ADD_PUSH_FRAME_TO_TRACE(uop, oparg, 0, target); + ADD_TO_TRACE(uop, oparg, 0, target); ADD_TO_TRACE(_EXIT_TRACE, 0, 0, 0); goto done; } @@ -875,7 +898,9 @@ translate_bytecode_to_trace( progress_needed ? "no progress" : "too short"); return 0; } - ADD_TO_TRACE(_EXIT_TRACE, 0, 0, target); + if (trace[trace_length-1].opcode != _JUMP_TO_TOP) { + ADD_TO_TRACE(_EXIT_TRACE, 0, 0, target); + } DPRINTF(1, "Created a proto-trace for %s (%s:%d) at byte offset %d -- length %d\n", PyUnicode_AsUTF8(code->co_qualname), @@ -883,8 +908,8 @@ translate_bytecode_to_trace( code->co_firstlineno, 2 * INSTR_IP(initial_instr, code), trace_length); - OPT_HIST(trace_length + buffer_size - max_length, trace_length_hist); - return 1; + OPT_HIST(trace_length, trace_length_hist); + return trace_length; } #undef RESERVE @@ -897,43 +922,86 @@ translate_bytecode_to_trace( #define SET_BIT(array, bit) (array[(bit)>>5] |= (1<<((bit)&31))) #define BIT_IS_SET(array, bit) (array[(bit)>>5] & (1<<((bit)&31))) -/* Count the number of used uops, and mark them in the bit vector `used`. - * This can be done in a single pass using simple reachability analysis, - * as there are no backward jumps. - * NOPs are excluded from the count. +/* Count the number of unused uops and exits */ static int -compute_used(_PyUOpInstruction *buffer, uint32_t *used, int *exit_count_ptr) +count_exits(_PyUOpInstruction *buffer, int length) { - int count = 0; int exit_count = 0; - SET_BIT(used, 0); - for (int i = 0; i < UOP_MAX_TRACE_LENGTH; i++) { - if (!BIT_IS_SET(used, i)) { - continue; - } - count++; + for (int i = 0; i < length; i++) { int opcode = buffer[i].opcode; - if (_PyUop_Flags[opcode] & HAS_EXIT_FLAG) { + if (opcode == _SIDE_EXIT) { exit_count++; } - if (opcode == _JUMP_TO_TOP || opcode == _EXIT_TRACE) { - continue; + } + return exit_count; +} + +static void make_exit(_PyUOpInstruction *inst, int opcode, int target) +{ + inst->opcode = opcode; + inst->oparg = 0; + inst->format = UOP_FORMAT_TARGET; + inst->target = target; +} + +/* Convert implicit exits, errors and deopts + * into explicit ones. */ +static int +prepare_for_execution(_PyUOpInstruction *buffer, int length) +{ + int32_t current_jump = -1; + int32_t current_jump_target = -1; + int32_t current_error = -1; + int32_t current_error_target = -1; + int32_t current_popped = -1; + /* Leaving in NOPs slows down the interpreter and messes up the stats */ + _PyUOpInstruction *copy_to = &buffer[0]; + for (int i = 0; i < length; i++) { + _PyUOpInstruction *inst = &buffer[i]; + if (inst->opcode != _NOP) { + if (copy_to != inst) { + *copy_to = *inst; + } + copy_to++; } - /* All other micro-ops fall through, so i+1 is reachable */ - SET_BIT(used, i+1); - assert(opcode <= MAX_UOP_ID); - if (_PyUop_Flags[opcode] & HAS_JUMP_FLAG) { - /* Mark target as reachable */ - SET_BIT(used, buffer[i].oparg); + } + length = (int)(copy_to - buffer); + int next_spare = length; + for (int i = 0; i < length; i++) { + _PyUOpInstruction *inst = &buffer[i]; + int opcode = inst->opcode; + int32_t target = (int32_t)uop_get_target(inst); + if (_PyUop_Flags[opcode] & (HAS_EXIT_FLAG | HAS_DEOPT_FLAG)) { + if (target != current_jump_target) { + uint16_t exit_op = (_PyUop_Flags[opcode] & HAS_EXIT_FLAG) ? _SIDE_EXIT : _DEOPT; + make_exit(&buffer[next_spare], exit_op, target); + current_jump_target = target; + current_jump = next_spare; + next_spare++; + } + buffer[i].jump_target = current_jump; + buffer[i].format = UOP_FORMAT_JUMP; } - if (opcode == NOP) { - count--; - UNSET_BIT(used, i); + if (_PyUop_Flags[opcode] & HAS_ERROR_FLAG) { + int popped = (_PyUop_Flags[opcode] & HAS_ERROR_NO_POP_FLAG) ? + 0 : _PyUop_num_popped(opcode, inst->oparg); + if (target != current_error_target || popped != current_popped) { + current_popped = popped; + current_error = next_spare; + current_error_target = target; + make_exit(&buffer[next_spare], _ERROR_POP_N, 0); + buffer[next_spare].oparg = popped; + next_spare++; + } + buffer[i].error_target = current_error; + if (buffer[i].format == UOP_FORMAT_TARGET) { + buffer[i].format = UOP_FORMAT_JUMP; + buffer[i].jump_target = 0; + } } } - *exit_count_ptr = exit_count; - return count; + return next_spare; } /* Executor side exits */ @@ -952,61 +1020,118 @@ allocate_executor(int exit_count, int length) return res; } +#ifdef Py_DEBUG + +#define CHECK(PRED) \ +if (!(PRED)) { \ + printf(#PRED " at %d\n", i); \ + assert(0); \ +} + +static int +target_unused(int opcode) +{ + return (_PyUop_Flags[opcode] & (HAS_ERROR_FLAG | HAS_EXIT_FLAG | HAS_DEOPT_FLAG)) == 0; +} + +static void +sanity_check(_PyExecutorObject *executor) +{ + for (uint32_t i = 0; i < executor->exit_count; i++) { + _PyExitData *exit = &executor->exits[i]; + CHECK(exit->target < (1 << 25)); + } + bool ended = false; + uint32_t i = 0; + CHECK(executor->trace[0].opcode == _START_EXECUTOR || executor->trace[0].opcode == _COLD_EXIT); + for (; i < executor->code_size; i++) { + const _PyUOpInstruction *inst = &executor->trace[i]; + uint16_t opcode = inst->opcode; + CHECK(opcode <= MAX_UOP_ID); + CHECK(_PyOpcode_uop_name[opcode] != NULL); + switch(inst->format) { + case UOP_FORMAT_TARGET: + CHECK(target_unused(opcode)); + break; + case UOP_FORMAT_EXIT: + CHECK(opcode == _SIDE_EXIT); + CHECK(inst->exit_index < executor->exit_count); + break; + case UOP_FORMAT_JUMP: + CHECK(inst->jump_target < executor->code_size); + break; + case UOP_FORMAT_UNUSED: + CHECK(0); + break; + } + if (_PyUop_Flags[opcode] & HAS_ERROR_FLAG) { + CHECK(inst->format == UOP_FORMAT_JUMP); + CHECK(inst->error_target < executor->code_size); + } + if (opcode == _JUMP_TO_TOP || opcode == _EXIT_TRACE || opcode == _COLD_EXIT) { + ended = true; + i++; + break; + } + } + CHECK(ended); + for (; i < executor->code_size; i++) { + const _PyUOpInstruction *inst = &executor->trace[i]; + uint16_t opcode = inst->opcode; + CHECK( + opcode == _DEOPT || + opcode == _SIDE_EXIT || + opcode == _ERROR_POP_N); + if (opcode == _SIDE_EXIT) { + CHECK(inst->format == UOP_FORMAT_EXIT); + } + } +} + +#undef CHECK +#endif + /* Makes an executor from a buffer of uops. * Account for the buffer having gaps and NOPs by computing a "used" * bit vector and only copying the used uops. Here "used" means reachable * and not a NOP. */ static _PyExecutorObject * -make_executor_from_uops(_PyUOpInstruction *buffer, const _PyBloomFilter *dependencies) +make_executor_from_uops(_PyUOpInstruction *buffer, int length, const _PyBloomFilter *dependencies) { - uint32_t used[(UOP_MAX_TRACE_LENGTH + 31)/32] = { 0 }; - int exit_count; - int length = compute_used(buffer, used, &exit_count); - length += 1; // For _START_EXECUTOR + int exit_count = count_exits(buffer, length); _PyExecutorObject *executor = allocate_executor(exit_count, length); if (executor == NULL) { return NULL; } - OPT_HIST(length, optimized_trace_length_hist); /* Initialize exits */ + assert(exit_count < COLD_EXIT_COUNT); for (int i = 0; i < exit_count; i++) { executor->exits[i].executor = &COLD_EXITS[i]; executor->exits[i].temperature = 0; } int next_exit = exit_count-1; - _PyUOpInstruction *dest = (_PyUOpInstruction *)&executor->trace[length-1]; - /* Scan backwards, so that we see the destinations of jumps before the jumps themselves. */ - for (int i = UOP_MAX_TRACE_LENGTH-1; i >= 0; i--) { - if (!BIT_IS_SET(used, i)) { - continue; - } - *dest = buffer[i]; + _PyUOpInstruction *dest = (_PyUOpInstruction *)&executor->trace[length]; + assert(buffer[0].opcode == _START_EXECUTOR); + buffer[0].operand = (uint64_t)executor; + for (int i = length-1; i >= 0; i--) { int opcode = buffer[i].opcode; - if (opcode == _POP_JUMP_IF_FALSE || - opcode == _POP_JUMP_IF_TRUE) - { - /* The oparg of the target will already have been set to its new offset */ - int oparg = dest->oparg; - dest->oparg = buffer[oparg].oparg; - } - if (_PyUop_Flags[opcode] & HAS_EXIT_FLAG) { + dest--; + *dest = buffer[i]; + assert(opcode != _POP_JUMP_IF_FALSE && opcode != _POP_JUMP_IF_TRUE); + if (opcode == _SIDE_EXIT) { executor->exits[next_exit].target = buffer[i].target; dest->exit_index = next_exit; + dest->format = UOP_FORMAT_EXIT; next_exit--; } - /* Set the oparg to be the destination offset, - * so that we can set the oparg of earlier jumps correctly. */ - buffer[i].oparg = (uint16_t)(dest - executor->trace); - dest--; } assert(next_exit == -1); assert(dest == executor->trace); - dest->opcode = _START_EXECUTOR; + assert(dest->opcode == _START_EXECUTOR); dest->oparg = 0; dest->target = 0; - dest->operand = (uintptr_t)executor; _Py_ExecutorInit(executor, dependencies); #ifdef Py_DEBUG char *python_lltrace = Py_GETENV("PYTHON_LLTRACE"); @@ -1022,6 +1147,7 @@ make_executor_from_uops(_PyUOpInstruction *buffer, const _PyBloomFilter *depende printf("\n"); } } + sanity_check(executor); #endif #ifdef _Py_JIT executor->jit_code = NULL; @@ -1050,6 +1176,9 @@ init_cold_exit_executor(_PyExecutorObject *executor, int oparg) for (int i = 0; i < BLOOM_FILTER_WORDS; i++) { assert(executor->vm_data.bloom.bits[i] == 0); } +#ifdef Py_DEBUG + sanity_check(executor); +#endif #ifdef _Py_JIT executor->jit_code = NULL; executor->jit_size = 0; @@ -1060,6 +1189,28 @@ init_cold_exit_executor(_PyExecutorObject *executor, int oparg) return 0; } +#ifdef Py_STATS +/* Returns the effective trace length. + * Ignores NOPs and trailing exit and error handling.*/ +int effective_trace_length(_PyUOpInstruction *buffer, int length) +{ + int nop_count = 0; + for (int i = 0; i < length; i++) { + int opcode = buffer[i].opcode; + if (opcode == _NOP) { + nop_count++; + } + if (opcode == _EXIT_TRACE || + opcode == _JUMP_TO_TOP || + opcode == _COLD_EXIT) { + return i+1-nop_count; + } + } + Py_FatalError("No terminating instruction"); + Py_UNREACHABLE(); +} +#endif + static int uop_optimize( _PyOptimizerObject *self, @@ -1072,24 +1223,26 @@ uop_optimize( _Py_BloomFilter_Init(&dependencies); _PyUOpInstruction buffer[UOP_MAX_TRACE_LENGTH]; OPT_STAT_INC(attempts); - int err = translate_bytecode_to_trace(frame, instr, buffer, UOP_MAX_TRACE_LENGTH, &dependencies); - if (err <= 0) { + int length = translate_bytecode_to_trace(frame, instr, buffer, UOP_MAX_TRACE_LENGTH, &dependencies); + if (length <= 0) { // Error or nothing translated - return err; + return length; } + assert(length < UOP_MAX_TRACE_LENGTH); OPT_STAT_INC(traces_created); char *env_var = Py_GETENV("PYTHON_UOPS_OPTIMIZE"); if (env_var == NULL || *env_var == '\0' || *env_var > '0') { - err = _Py_uop_analyze_and_optimize(frame, buffer, - UOP_MAX_TRACE_LENGTH, + length = _Py_uop_analyze_and_optimize(frame, buffer, + length, curr_stackentries, &dependencies); - if (err <= 0) { - return err; + if (length <= 0) { + return length; } } - assert(err == 1); + assert(length < UOP_MAX_TRACE_LENGTH); + assert(length >= 1); /* Fix up */ - for (int pc = 0; pc < UOP_MAX_TRACE_LENGTH; pc++) { + for (int pc = 0; pc < length; pc++) { int opcode = buffer[pc].opcode; int oparg = buffer[pc].oparg; if (_PyUop_Flags[opcode] & HAS_OPARG_AND_1_FLAG) { @@ -1104,10 +1257,14 @@ uop_optimize( assert(_PyOpcode_uop_name[buffer[pc].opcode]); assert(strncmp(_PyOpcode_uop_name[buffer[pc].opcode], _PyOpcode_uop_name[opcode], strlen(_PyOpcode_uop_name[opcode])) == 0); } - _PyExecutorObject *executor = make_executor_from_uops(buffer, &dependencies); + OPT_HIST(effective_trace_length(buffer, length), optimized_trace_length_hist); + length = prepare_for_execution(buffer, length); + assert(length <= UOP_MAX_TRACE_LENGTH); + _PyExecutorObject *executor = make_executor_from_uops(buffer, length, &dependencies); if (executor == NULL) { return -1; } + assert(length <= UOP_MAX_TRACE_LENGTH); *exec_ptr = executor; return 1; } @@ -1182,12 +1339,14 @@ counter_optimize( return 0; } _Py_CODEUNIT *target = instr + 1 + _PyOpcode_Caches[JUMP_BACKWARD] - oparg; - _PyUOpInstruction buffer[3] = { + _PyUOpInstruction buffer[5] = { + { .opcode = _START_EXECUTOR }, { .opcode = _LOAD_CONST_INLINE_BORROW, .operand = (uintptr_t)self }, { .opcode = _INTERNAL_INCREMENT_OPT_COUNTER }, - { .opcode = _EXIT_TRACE, .target = (uint32_t)(target - _PyCode_CODE(code)) } + { .opcode = _EXIT_TRACE, .jump_target = 4, .format=UOP_FORMAT_JUMP }, + { .opcode = _SIDE_EXIT, .target = (uint32_t)(target - _PyCode_CODE(code)), .format=UOP_FORMAT_TARGET } }; - _PyExecutorObject *executor = make_executor_from_uops(buffer, &EMPTY_FILTER); + _PyExecutorObject *executor = make_executor_from_uops(buffer, 5, &EMPTY_FILTER); if (executor == NULL) { return -1; } diff --git a/Python/optimizer_analysis.c b/Python/optimizer_analysis.c index cf921ba633129a..61fd42b2ac0368 100644 --- a/Python/optimizer_analysis.c +++ b/Python/optimizer_analysis.c @@ -580,6 +580,14 @@ peephole_opt(_PyInterpreterFrame *frame, _PyUOpInstruction *buffer, int buffer_s } } +// TODO: there are no more changes to optimizer.c. We will only see regular +// _CHECK_STACK_SPACEs in this function. Need to make the following change: +// when we see _CHECK_STACK_SPACE, look ahead to _PUSH_FRAME. Extract the +// code object out of _PUSH_FRAME. Save that object's framesize and convert +// _CHECK_STACK_SPACE to _CHECK_STACK_SPACE_OPERAND. If we ever can't get a +// code object, stop the optimization. It should still be ok to save max_space +// to the first _CHECK_STACK_SPACE_OPERAND at that point, since the rest will +// be regular _CHECK_STACK_SPACEs and will be handled at execution time. static void combine_stack_space_checks( _PyUOpInstruction *buffer, @@ -592,7 +600,7 @@ combine_stack_space_checks( * calls in this trace. */ #ifdef Py_DEBUG - // assume there's a _PUSH_STACK after every _CHECK_STACK_SPACE_OPERAND + // assume there's a _PUSH_FRAME after every _CHECK_STACK_SPACE_OPERAND bool expecting_push = false; /* We might see _CHECK_STACK_SPACE, but only if no code object was * available for _PUSH_FRAME at trace projection time. If this occurs, @@ -601,9 +609,9 @@ combine_stack_space_checks( bool saw_check_stack_space = false; #endif int depth = 0; - uint64_t space_at_depth[MAX_ABSTRACT_FRAME_DEPTH]; - uint64_t curr_space = 0; - uint64_t max_space = 0; + uint32_t space_at_depth[MAX_ABSTRACT_FRAME_DEPTH]; + uint32_t curr_space = 0; + uint32_t max_space = 0; _PyUOpInstruction *first_check_stack = NULL; for (int pc = 0; pc < buffer_size; pc++) { int opcode = buffer[pc].opcode; From 617e4f30bb6daf4b215026a5a64cb0e2802e97b9 Mon Sep 17 00:00:00 2001 From: Peter Lazorchak Date: Wed, 27 Mar 2024 17:56:56 -0700 Subject: [PATCH 04/12] Move all _CHECK_STACK_SPACE consolidation to optimizer_analysis.c --- Lib/test/test_capi/test_opt.py | 35 ++--------- Python/optimizer_analysis.c | 110 +++++++++++++++++++++------------ 2 files changed, 73 insertions(+), 72 deletions(-) diff --git a/Lib/test/test_capi/test_opt.py b/Lib/test/test_capi/test_opt.py index de01fd171390fd..8432147516ab6a 100644 --- a/Lib/test/test_capi/test_opt.py +++ b/Lib/test/test_capi/test_opt.py @@ -1245,9 +1245,9 @@ def testfunc(n): uop_names = [uop[0] for uop in uops_and_operands] self.assertEqual(uop_names.count("_PUSH_FRAME"), 2) self.assertEqual(uop_names.count("_POP_FRAME"), 0) - self.assertEqual(uop_names.count("_CHECK_STACK_SPACE"), 0) + self.assertEqual(uop_names.count("_CHECK_STACK_SPACE"), 1) self.assertEqual(uop_names.count("_CHECK_STACK_SPACE_OPERAND"), 1) - largest_stack = _testinternalcapi.get_co_framesize(dummy15.__code__) * 2 + largest_stack = _testinternalcapi.get_co_framesize(dummy15.__code__) self.assertIn(("_CHECK_STACK_SPACE_OPERAND", largest_stack), uops_and_operands) def test_many_nested(self): @@ -1268,42 +1268,15 @@ def dummy_g(x): return dummy_f(x) def dummy_h(x): return dummy_g(x) - def dummy_i(x): - return dummy_h(x) - def dummy_j(x): - return dummy_i(x) - def dummy_k(x): - return dummy_j(x) - def dummy_l(x): - return dummy_k(x) def testfunc(n): a = 0 for _ in range(n): - a += dummy_l(n) + a += dummy_h(n) return a - self._assert_framesize_cross_platform((dummy_l), 14) - res, ex = self._run_with_optimizer(testfunc, 32) self.assertEqual(res, 32 * 32) - self.assertIsNotNone(ex) - - uops_and_operands = [(opcode, operand) for opcode, _, _, operand in list(ex)] - uop_names = [uop[0] for uop in uops_and_operands] - self.assertEqual(uop_names.count("_PUSH_FRAME"), 6) - # we run out of trace stack depth before emitting any _POP_FRAMEs - self.assertEqual(uop_names.count("_POP_FRAME"), 0) - self.assertEqual(uop_names.count("_CHECK_STACK_SPACE"), 0) - self.assertEqual(uop_names.count("_CHECK_STACK_SPACE_OPERAND"), 1) - largest_stack = ( - _testinternalcapi.get_co_framesize(dummy_l.__code__) + - _testinternalcapi.get_co_framesize(dummy_k.__code__) + - _testinternalcapi.get_co_framesize(dummy_j.__code__) + - _testinternalcapi.get_co_framesize(dummy_i.__code__) + - _testinternalcapi.get_co_framesize(dummy_h.__code__) + - _testinternalcapi.get_co_framesize(dummy_g.__code__) - ) - self.assertIn(("_CHECK_STACK_SPACE_OPERAND", largest_stack), uops_and_operands) + self.assertIsNone(ex) if __name__ == "__main__": unittest.main() diff --git a/Python/optimizer_analysis.c b/Python/optimizer_analysis.c index a8acfe90c2d4b9..88dc714d26afdc 100644 --- a/Python/optimizer_analysis.c +++ b/Python/optimizer_analysis.c @@ -533,6 +533,29 @@ remove_unneeded_uops(_PyUOpInstruction *buffer, int buffer_size) Py_UNREACHABLE(); } +/* _PUSH_FRAME/_POP_FRAME's operand can be 0, a PyFunctionObject *, or a + * PyCodeObject *. Retrieve the code object if possible. + */ +static PyCodeObject * +get_co(_PyUOpInstruction *op) { + assert(op->opcode == _PUSH_FRAME || op->opcode == _POP_FRAME); + PyCodeObject *co = NULL; + uint64_t operand = op->operand; + if (operand == 0) { + return NULL; + } + if (operand & 1) { + co = (PyCodeObject *)(operand & ~1); + } + else { + PyFunctionObject *func = (PyFunctionObject *)operand; + assert(PyFunction_Check(func)); + co = (PyCodeObject *)func->func_code; + } + assert(PyCode_Check(co)); + return co; +} + static void peephole_opt(_PyInterpreterFrame *frame, _PyUOpInstruction *buffer, int buffer_size) { @@ -561,7 +584,7 @@ peephole_opt(_PyInterpreterFrame *frame, _PyUOpInstruction *buffer, int buffer_s { uint64_t operand = buffer[pc].operand; if (operand & 1) { - co = (PyCodeObject *)(operand & ~1); + co = (PyCodeObject *)(operand & ~1); // TODO: use get_co assert(PyCode_Check(co)); } else if (operand == 0) { @@ -581,68 +604,75 @@ peephole_opt(_PyInterpreterFrame *frame, _PyUOpInstruction *buffer, int buffer_s } } -// TODO: there are no more changes to optimizer.c. We will only see regular -// _CHECK_STACK_SPACEs in this function. Need to make the following change: -// when we see _CHECK_STACK_SPACE, look ahead to _PUSH_FRAME. Extract the -// code object out of _PUSH_FRAME. Save that object's framesize and convert -// _CHECK_STACK_SPACE to _CHECK_STACK_SPACE_OPERAND. If we ever can't get a -// code object, stop the optimization. It should still be ok to save max_space -// to the first _CHECK_STACK_SPACE_OPERAND at that point, since the rest will -// be regular _CHECK_STACK_SPACEs and will be handled at execution time. +// _PUSH_FRAME is 3 ops in front of _CHECK_STACK_SPACE +#define _CHECK_STACK_TO_PUSH_FRAME_OFFSET 3 + +/* Compute the maximum stack space needed for this trace and consolidate + * all possible _CHECK_STACK_SPACE ops to one _CHECK_STACK_SPACE_OPERAND. + * This calculation is essentially a traversal over the tree of function + * calls in this trace. + */ static void combine_stack_space_checks( _PyUOpInstruction *buffer, int buffer_size ) { - /* Compute the maximum stack space needed for this trace and consolidate - * all _CHECK_STACK_SPACE_OPERAND ops to just one. - * This calculation is essentially a traversal over the tree of function - * calls in this trace. - */ #ifdef Py_DEBUG - // assume there's a _PUSH_FRAME after every _CHECK_STACK_SPACE_OPERAND + // assume there's a _PUSH_FRAME after every _CHECK_STACK_SPACE bool expecting_push = false; - /* We might see _CHECK_STACK_SPACE, but only if no code object was - * available for _PUSH_FRAME at trace projection time. If this occurs, - * it should be closely followed by _PUSH_FRAME + _EXIT_TRACE and must - * not be followed by any _CHECK_STACK_SPACE_OPERANDs. */ - bool saw_check_stack_space = false; + // if we ever can't get a framesize from _PUSH_FRAME, must exit soon + bool must_exit = false; #endif int depth = 0; uint32_t space_at_depth[MAX_ABSTRACT_FRAME_DEPTH]; uint32_t curr_space = 0; uint32_t max_space = 0; - _PyUOpInstruction *first_check_stack = NULL; + _PyUOpInstruction *first_valid_check_stack = NULL; for (int pc = 0; pc < buffer_size; pc++) { int opcode = buffer[pc].opcode; switch (opcode) { - case _CHECK_STACK_SPACE_OPERAND: { + case _CHECK_STACK_SPACE: { #ifdef Py_DEBUG assert(expecting_push == false); - assert(saw_check_stack_space == false); + assert(must_exit == false); + expecting_push = true; +#endif + uint32_t framesize = 0; + if (pc + _CHECK_STACK_TO_PUSH_FRAME_OFFSET < buffer_size) { + _PyUOpInstruction *uop = &buffer[ + pc + _CHECK_STACK_TO_PUSH_FRAME_OFFSET + ]; + assert(uop->opcode == _PUSH_FRAME); + PyCodeObject *code = get_co(uop); + if (code != NULL) { + framesize = code->co_framesize; + } + } + if (framesize == 0) { +#ifdef Py_DEBUG + must_exit = true; #endif - if (first_check_stack == NULL) { - first_check_stack = &buffer[pc]; + break; + } + if (first_valid_check_stack == NULL) { + first_valid_check_stack = &buffer[pc]; } else { - // delete all but the first _CHECK_STACK_SPACE_OPERAND + // delete all but the first valid _CHECK_STACK_SPACE buffer[pc].opcode = _NOP; } - space_at_depth[depth] = buffer[pc].operand; - curr_space += buffer[pc].operand; + space_at_depth[depth] = framesize; + curr_space += framesize; max_space = curr_space > max_space ? curr_space : max_space; depth++; assert(depth < MAX_ABSTRACT_FRAME_DEPTH); -#ifdef Py_DEBUG - expecting_push = true; -#endif break; } case _POP_FRAME: { #ifdef Py_DEBUG assert(expecting_push == false); - assert(saw_check_stack_space == false); + assert(must_exit == false); #endif depth--; assert(depth >= 0); @@ -650,22 +680,20 @@ combine_stack_space_checks( break; } #ifdef Py_DEBUG - case _CHECK_STACK_SPACE: { - assert(expecting_push == false); - expecting_push = true; - saw_check_stack_space = true; - break; - } case _PUSH_FRAME: { assert(expecting_push == true); expecting_push = false; break; } #endif + case _JUMP_TO_TOP: case _EXIT_TRACE: { - if (first_check_stack != NULL) { + if (first_valid_check_stack != NULL) { + assert(first_valid_check_stack->opcode == _CHECK_STACK_SPACE); + assert(max_space > 0); assert(max_space < INT_MAX); - first_check_stack->operand = max_space; + first_valid_check_stack->opcode = _CHECK_STACK_SPACE_OPERAND; + first_valid_check_stack->operand = max_space; } return; } @@ -704,7 +732,7 @@ _Py_uop_analyze_and_optimize( return length; } - combine_stack_space_checks(buffer, buffer_size); + combine_stack_space_checks(buffer, length); length = remove_unneeded_uops(buffer, length); assert(length > 0); From 95ef8234f7b8da0cfa5365f02c859920b9557019 Mon Sep 17 00:00:00 2001 From: Peter Lazorchak Date: Wed, 27 Mar 2024 18:01:51 -0700 Subject: [PATCH 05/12] Use get_co in peephole_opt --- Python/optimizer_analysis.c | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) diff --git a/Python/optimizer_analysis.c b/Python/optimizer_analysis.c index 88dc714d26afdc..a4a5531d244c5f 100644 --- a/Python/optimizer_analysis.c +++ b/Python/optimizer_analysis.c @@ -582,19 +582,7 @@ peephole_opt(_PyInterpreterFrame *frame, _PyUOpInstruction *buffer, int buffer_s case _PUSH_FRAME: case _POP_FRAME: { - uint64_t operand = buffer[pc].operand; - if (operand & 1) { - co = (PyCodeObject *)(operand & ~1); // TODO: use get_co - assert(PyCode_Check(co)); - } - else if (operand == 0) { - co = NULL; - } - else { - PyFunctionObject *func = (PyFunctionObject *)operand; - assert(PyFunction_Check(func)); - co = (PyCodeObject *)func->func_code; - } + co = get_co(&buffer[pc]); break; } case _JUMP_TO_TOP: From defdc2b235c9c661b9e75d6b597e54ed94915ea5 Mon Sep 17 00:00:00 2001 From: Peter Lazorchak Date: Wed, 27 Mar 2024 18:12:24 -0700 Subject: [PATCH 06/12] clean up diff --- Python/optimizer_analysis.c | 72 ++++++++++++++++++------------------- 1 file changed, 36 insertions(+), 36 deletions(-) diff --git a/Python/optimizer_analysis.c b/Python/optimizer_analysis.c index a4a5531d244c5f..17cdf861d6b35a 100644 --- a/Python/optimizer_analysis.c +++ b/Python/optimizer_analysis.c @@ -556,42 +556,6 @@ get_co(_PyUOpInstruction *op) { return co; } -static void -peephole_opt(_PyInterpreterFrame *frame, _PyUOpInstruction *buffer, int buffer_size) -{ - PyCodeObject *co = _PyFrame_GetCode(frame); - for (int pc = 0; pc < buffer_size; pc++) { - int opcode = buffer[pc].opcode; - switch(opcode) { - case _LOAD_CONST: { - assert(co != NULL); - PyObject *val = PyTuple_GET_ITEM(co->co_consts, buffer[pc].oparg); - buffer[pc].opcode = _Py_IsImmortal(val) ? _LOAD_CONST_INLINE_BORROW : _LOAD_CONST_INLINE; - buffer[pc].operand = (uintptr_t)val; - break; - } - case _CHECK_PEP_523: - { - /* Setting the eval frame function invalidates - * all executors, so no need to check dynamically */ - if (_PyInterpreterState_GET()->eval_frame == NULL) { - buffer[pc].opcode = _NOP; - } - break; - } - case _PUSH_FRAME: - case _POP_FRAME: - { - co = get_co(&buffer[pc]); - break; - } - case _JUMP_TO_TOP: - case _EXIT_TRACE: - return; - } - } -} - // _PUSH_FRAME is 3 ops in front of _CHECK_STACK_SPACE #define _CHECK_STACK_TO_PUSH_FRAME_OFFSET 3 @@ -691,6 +655,42 @@ combine_stack_space_checks( Py_UNREACHABLE(); } +static void +peephole_opt(_PyInterpreterFrame *frame, _PyUOpInstruction *buffer, int buffer_size) +{ + PyCodeObject *co = _PyFrame_GetCode(frame); + for (int pc = 0; pc < buffer_size; pc++) { + int opcode = buffer[pc].opcode; + switch(opcode) { + case _LOAD_CONST: { + assert(co != NULL); + PyObject *val = PyTuple_GET_ITEM(co->co_consts, buffer[pc].oparg); + buffer[pc].opcode = _Py_IsImmortal(val) ? _LOAD_CONST_INLINE_BORROW : _LOAD_CONST_INLINE; + buffer[pc].operand = (uintptr_t)val; + break; + } + case _CHECK_PEP_523: + { + /* Setting the eval frame function invalidates + * all executors, so no need to check dynamically */ + if (_PyInterpreterState_GET()->eval_frame == NULL) { + buffer[pc].opcode = _NOP; + } + break; + } + case _PUSH_FRAME: + case _POP_FRAME: + { + co = get_co(&buffer[pc]); + break; + } + case _JUMP_TO_TOP: + case _EXIT_TRACE: + return; + } + } +} + // 0 - failure, no error raised, just fall back to Tier 1 // -1 - failure, and raise error // > 0 - length of optimized trace From df0d5890ea0eb8cdb30699def05a2f0ce53b6866 Mon Sep 17 00:00:00 2001 From: Peter Lazorchak Date: Thu, 28 Mar 2024 20:41:21 -0700 Subject: [PATCH 07/12] Update with PR feedback, still need to move logic to optimizer_bytecodes.c --- Lib/test/test_capi/test_opt.py | 28 -------- Python/bytecodes.c | 2 +- Python/executor_cases.c.h | 2 +- Python/optimizer_analysis.c | 125 ++++++++++++++++----------------- 4 files changed, 61 insertions(+), 96 deletions(-) diff --git a/Lib/test/test_capi/test_opt.py b/Lib/test/test_capi/test_opt.py index 8432147516ab6a..edcb76929526bc 100644 --- a/Lib/test/test_capi/test_opt.py +++ b/Lib/test/test_capi/test_opt.py @@ -579,14 +579,6 @@ def testfunc(n): @requires_specialization @unittest.skipIf(os.getenv("PYTHON_UOPS_OPTIMIZE") == "0", "Needs uop optimizer to run.") class TestUopsOptimization(unittest.TestCase): - FRAMESIZE_ADJUSTMENT = 1 if platform.architecture()[0] == "32bit" else 0 - - def _assert_framesize_cross_platform(self, fn_obj, expected_framesize): - self.assertEqual( - _testinternalcapi.get_co_framesize(fn_obj.__code__), - expected_framesize + self.FRAMESIZE_ADJUSTMENT - ) - def _run_with_optimizer(self, testfunc, arg): res = None opt = _testinternalcapi.new_uop_optimizer() @@ -974,8 +966,6 @@ def testfunc(n): a += b + c + d return a - self._assert_framesize_cross_platform((dummy12), 12) - self._assert_framesize_cross_platform((dummy13), 13) res, ex = self._run_with_optimizer(testfunc, 32) self.assertEqual(res, 832) @@ -1004,8 +994,6 @@ def testfunc(n): a += b + c return a - self._assert_framesize_cross_platform((dummy12), 12) - self._assert_framesize_cross_platform((dummy15), 15) res, ex = self._run_with_optimizer(testfunc, 32) self.assertEqual(res, 224) @@ -1042,9 +1030,6 @@ def testfunc(n): a += b + c + d + e return a - self._assert_framesize_cross_platform((dummy12), 12) - self._assert_framesize_cross_platform((dummy13), 13) - self._assert_framesize_cross_platform((dummy18), 18) res, ex = self._run_with_optimizer(testfunc, 32) self.assertEqual(res, 800) @@ -1082,9 +1067,6 @@ def testfunc(n): a += b + c + d + e return a - self._assert_framesize_cross_platform((dummy12), 12) - self._assert_framesize_cross_platform((dummy13), 13) - self._assert_framesize_cross_platform((dummy18), 18) res, ex = self._run_with_optimizer(testfunc, 32) self.assertEqual(res, 800) @@ -1130,13 +1112,6 @@ def testfunc(n): a += b + c + d return a - self._assert_framesize_cross_platform(dummy0, 11) - self._assert_framesize_cross_platform(dummy1, 14) - self._assert_framesize_cross_platform(dummy2, 14) - self._assert_framesize_cross_platform(dummy3, 14) - self._assert_framesize_cross_platform(dummy4, 16) - self._assert_framesize_cross_platform(dummy5, 14) - self._assert_framesize_cross_platform(dummy6, 18) res, ex = self._run_with_optimizer(testfunc, 32) self.assertEqual(res, 96) @@ -1198,8 +1173,6 @@ def testfunc(n): b += dummy15(7) return b - self._assert_framesize_cross_platform((dummy_large), repetitions + 12) - self._assert_framesize_cross_platform((dummy15), 15) res, ex = self._run_with_optimizer(testfunc, 32) self.assertEqual(res, 32 * (repetitions + 9)) @@ -1235,7 +1208,6 @@ def testfunc(n): a += dummy15(n) return a - self._assert_framesize_cross_platform((dummy15), 15) res, ex = self._run_with_optimizer(testfunc, 32) self.assertEqual(res, 42 * 32) diff --git a/Python/bytecodes.c b/Python/bytecodes.c index f035032f23c6d8..d150f8747d97b2 100644 --- a/Python/bytecodes.c +++ b/Python/bytecodes.c @@ -4103,7 +4103,7 @@ dummy_func( } tier2 op(_CHECK_STACK_SPACE_OPERAND, (framesize/2 --)) { - assert(framesize < INT_MAX); + assert(framesize <= INT_MAX); DEOPT_IF(!_PyThreadState_HasStackSpace(tstate, framesize)); DEOPT_IF(tstate->py_recursion_remaining <= 1); } diff --git a/Python/executor_cases.c.h b/Python/executor_cases.c.h index c8a668d0c90b30..0a78dc490aca28 100644 --- a/Python/executor_cases.c.h +++ b/Python/executor_cases.c.h @@ -3603,7 +3603,7 @@ case _CHECK_STACK_SPACE_OPERAND: { uint32_t framesize = (uint32_t)CURRENT_OPERAND(); - assert(framesize < INT_MAX); + assert(framesize <= INT_MAX); if (!_PyThreadState_HasStackSpace(tstate, framesize)) JUMP_TO_JUMP_TARGET(); if (tstate->py_recursion_remaining <= 1) JUMP_TO_JUMP_TARGET(); break; diff --git a/Python/optimizer_analysis.c b/Python/optimizer_analysis.c index 17cdf861d6b35a..37d66c3ba290d6 100644 --- a/Python/optimizer_analysis.c +++ b/Python/optimizer_analysis.c @@ -529,15 +529,13 @@ remove_unneeded_uops(_PyUOpInstruction *buffer, int buffer_size) } } } - Py_FatalError("No terminating instruction"); Py_UNREACHABLE(); } /* _PUSH_FRAME/_POP_FRAME's operand can be 0, a PyFunctionObject *, or a * PyCodeObject *. Retrieve the code object if possible. */ -static PyCodeObject * -get_co(_PyUOpInstruction *op) { +static PyCodeObject *get_co(_PyUOpInstruction *op) { assert(op->opcode == _PUSH_FRAME || op->opcode == _POP_FRAME); PyCodeObject *co = NULL; uint64_t operand = op->operand; @@ -556,13 +554,19 @@ get_co(_PyUOpInstruction *op) { return co; } -// _PUSH_FRAME is 3 ops in front of _CHECK_STACK_SPACE -#define _CHECK_STACK_TO_PUSH_FRAME_OFFSET 3 +/* Get the frame size from a _PUSH_FRAME/_POP_FRAME instruction, + * -1 if the information isn't available. + */ +static int get_framesize(_PyUOpInstruction *op) { + PyCodeObject *co = get_co(op); + if (co == NULL) { + return -1; + } + return co->co_framesize; +} /* Compute the maximum stack space needed for this trace and consolidate * all possible _CHECK_STACK_SPACE ops to one _CHECK_STACK_SPACE_OPERAND. - * This calculation is essentially a traversal over the tree of function - * calls in this trace. */ static void combine_stack_space_checks( @@ -570,89 +574,78 @@ combine_stack_space_checks( int buffer_size ) { -#ifdef Py_DEBUG - // assume there's a _PUSH_FRAME after every _CHECK_STACK_SPACE - bool expecting_push = false; - // if we ever can't get a framesize from _PUSH_FRAME, must exit soon - bool must_exit = false; -#endif - int depth = 0; - uint32_t space_at_depth[MAX_ABSTRACT_FRAME_DEPTH]; - uint32_t curr_space = 0; - uint32_t max_space = 0; + int curr_framesize = -1; + int curr_space = 0; + int max_space = 0; _PyUOpInstruction *first_valid_check_stack = NULL; + _PyUOpInstruction *corresponding_check_stack = NULL; for (int pc = 0; pc < buffer_size; pc++) { int opcode = buffer[pc].opcode; switch (opcode) { case _CHECK_STACK_SPACE: { -#ifdef Py_DEBUG - assert(expecting_push == false); - assert(must_exit == false); - expecting_push = true; -#endif - uint32_t framesize = 0; - if (pc + _CHECK_STACK_TO_PUSH_FRAME_OFFSET < buffer_size) { - _PyUOpInstruction *uop = &buffer[ - pc + _CHECK_STACK_TO_PUSH_FRAME_OFFSET - ]; - assert(uop->opcode == _PUSH_FRAME); - PyCodeObject *code = get_co(uop); - if (code != NULL) { - framesize = code->co_framesize; - } + assert(corresponding_check_stack == NULL); + corresponding_check_stack = &buffer[pc]; + break; + } + case _PUSH_FRAME: { + assert(corresponding_check_stack != NULL); + curr_framesize = get_framesize(&buffer[pc]); + if (curr_framesize == -1) { + goto finish; } - if (framesize == 0) { -#ifdef Py_DEBUG - must_exit = true; -#endif - break; + assert(curr_framesize > 0); + curr_space += curr_framesize; + if (curr_space < 0 || curr_space > UINT32_MAX) { + // overflow or won't fit in operand + goto finish; } + max_space = curr_space > max_space ? curr_space : max_space; if (first_valid_check_stack == NULL) { - first_valid_check_stack = &buffer[pc]; + first_valid_check_stack = corresponding_check_stack; } else { // delete all but the first valid _CHECK_STACK_SPACE - buffer[pc].opcode = _NOP; + corresponding_check_stack->opcode = _NOP; } - space_at_depth[depth] = framesize; - curr_space += framesize; - max_space = curr_space > max_space ? curr_space : max_space; - depth++; - assert(depth < MAX_ABSTRACT_FRAME_DEPTH); + corresponding_check_stack = NULL; break; } case _POP_FRAME: { -#ifdef Py_DEBUG - assert(expecting_push == false); - assert(must_exit == false); -#endif - depth--; - assert(depth >= 0); - curr_space -= space_at_depth[depth]; + assert(corresponding_check_stack == NULL); + assert(curr_framesize > 0); + assert(curr_framesize <= curr_space); + curr_space -= curr_framesize; + curr_framesize = get_framesize(&buffer[pc]); + assert(curr_framesize > 0); + if (curr_framesize == -1) { + goto finish; + } break; } + case _JUMP_TO_TOP: + case _EXIT_TRACE: { + goto finish; + } #ifdef Py_DEBUG - case _PUSH_FRAME: { - assert(expecting_push == true); - expecting_push = false; + case _CHECK_STACK_SPACE_OPERAND: { + /* We should never see _CHECK_STACK_SPACE_OPERANDs. + * They are only created at the end of this pass. */ + assert(false); break; } #endif - case _JUMP_TO_TOP: - case _EXIT_TRACE: { - if (first_valid_check_stack != NULL) { - assert(first_valid_check_stack->opcode == _CHECK_STACK_SPACE); - assert(max_space > 0); - assert(max_space < INT_MAX); - first_valid_check_stack->opcode = _CHECK_STACK_SPACE_OPERAND; - first_valid_check_stack->operand = max_space; - } - return; - } } } - Py_FatalError("No terminating instruction"); Py_UNREACHABLE(); +finish: + if (first_valid_check_stack != NULL) { + assert(first_valid_check_stack->opcode == _CHECK_STACK_SPACE); + assert(max_space > 0); + assert(max_space <= INT_MAX); + assert(max_space <= UINT32_MAX); + first_valid_check_stack->opcode = _CHECK_STACK_SPACE_OPERAND; + first_valid_check_stack->operand = max_space; + } } static void From c12119f9cde564f4a8d858c0ac1e6dea05a72348 Mon Sep 17 00:00:00 2001 From: Peter Lazorchak Date: Fri, 29 Mar 2024 17:24:19 -0700 Subject: [PATCH 08/12] Make combine_stack_space_checks as similar to peephole_opt as possible --- Python/optimizer_analysis.c | 45 +++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 25 deletions(-) diff --git a/Python/optimizer_analysis.c b/Python/optimizer_analysis.c index 37d66c3ba290d6..db170b9b01a19d 100644 --- a/Python/optimizer_analysis.c +++ b/Python/optimizer_analysis.c @@ -554,27 +554,17 @@ static PyCodeObject *get_co(_PyUOpInstruction *op) { return co; } -/* Get the frame size from a _PUSH_FRAME/_POP_FRAME instruction, - * -1 if the information isn't available. - */ -static int get_framesize(_PyUOpInstruction *op) { - PyCodeObject *co = get_co(op); - if (co == NULL) { - return -1; - } - return co->co_framesize; -} - /* Compute the maximum stack space needed for this trace and consolidate * all possible _CHECK_STACK_SPACE ops to one _CHECK_STACK_SPACE_OPERAND. */ static void combine_stack_space_checks( + _PyInterpreterFrame *frame, _PyUOpInstruction *buffer, int buffer_size ) { - int curr_framesize = -1; + PyCodeObject *co = _PyFrame_GetCode(frame); int curr_space = 0; int max_space = 0; _PyUOpInstruction *first_valid_check_stack = NULL; @@ -589,13 +579,15 @@ combine_stack_space_checks( } case _PUSH_FRAME: { assert(corresponding_check_stack != NULL); - curr_framesize = get_framesize(&buffer[pc]); - if (curr_framesize == -1) { + co = get_co(&buffer[pc]); + if (co == NULL) { + // should be about to _EXIT_TRACE anyway goto finish; } - assert(curr_framesize > 0); - curr_space += curr_framesize; - if (curr_space < 0 || curr_space > UINT32_MAX) { + int framesize = co->co_framesize; + assert(framesize > 0); + curr_space += framesize; + if (curr_space < 0 || curr_space > INT32_MAX) { // overflow or won't fit in operand goto finish; } @@ -612,12 +604,14 @@ combine_stack_space_checks( } case _POP_FRAME: { assert(corresponding_check_stack == NULL); - assert(curr_framesize > 0); - assert(curr_framesize <= curr_space); - curr_space -= curr_framesize; - curr_framesize = get_framesize(&buffer[pc]); - assert(curr_framesize > 0); - if (curr_framesize == -1) { + assert(co != NULL); + int framesize = co->co_framesize; + assert(framesize > 0); + assert(framesize <= curr_space); + curr_space -= framesize; + co = get_co(&buffer[pc]); + if (co == NULL) { + // might be impossible, but bailing is still safe goto finish; } break; @@ -642,7 +636,7 @@ combine_stack_space_checks( assert(first_valid_check_stack->opcode == _CHECK_STACK_SPACE); assert(max_space > 0); assert(max_space <= INT_MAX); - assert(max_space <= UINT32_MAX); + assert(max_space <= INT32_MAX); first_valid_check_stack->opcode = _CHECK_STACK_SPACE_OPERAND; first_valid_check_stack->operand = max_space; } @@ -675,6 +669,7 @@ peephole_opt(_PyInterpreterFrame *frame, _PyUOpInstruction *buffer, int buffer_s case _POP_FRAME: { co = get_co(&buffer[pc]); + // if co is NULL, goto finish probably makes sense break; } case _JUMP_TO_TOP: @@ -704,6 +699,7 @@ _Py_uop_analyze_and_optimize( } peephole_opt(frame, buffer, length); + combine_stack_space_checks(frame, buffer, length); length = optimize_uops( _PyFrame_GetCode(frame), buffer, @@ -713,7 +709,6 @@ _Py_uop_analyze_and_optimize( return length; } - combine_stack_space_checks(buffer, length); length = remove_unneeded_uops(buffer, length); assert(length > 0); From f0b2133ef7a2282555bab9025266595aa03b0c2e Mon Sep 17 00:00:00 2001 From: Peter Lazorchak Date: Fri, 29 Mar 2024 17:38:55 -0700 Subject: [PATCH 09/12] Integrate combine_stack_space_checks with peephole_opt --- Python/optimizer_analysis.c | 70 ++++++++++--------------------------- 1 file changed, 19 insertions(+), 51 deletions(-) diff --git a/Python/optimizer_analysis.c b/Python/optimizer_analysis.c index db170b9b01a19d..6466816da8b011 100644 --- a/Python/optimizer_analysis.c +++ b/Python/optimizer_analysis.c @@ -554,15 +554,8 @@ static PyCodeObject *get_co(_PyUOpInstruction *op) { return co; } -/* Compute the maximum stack space needed for this trace and consolidate - * all possible _CHECK_STACK_SPACE ops to one _CHECK_STACK_SPACE_OPERAND. - */ static void -combine_stack_space_checks( - _PyInterpreterFrame *frame, - _PyUOpInstruction *buffer, - int buffer_size -) +peephole_opt(_PyInterpreterFrame *frame, _PyUOpInstruction *buffer, int buffer_size) { PyCodeObject *co = _PyFrame_GetCode(frame); int curr_space = 0; @@ -571,7 +564,22 @@ combine_stack_space_checks( _PyUOpInstruction *corresponding_check_stack = NULL; for (int pc = 0; pc < buffer_size; pc++) { int opcode = buffer[pc].opcode; - switch (opcode) { + switch(opcode) { + case _LOAD_CONST: { + assert(co != NULL); + PyObject *val = PyTuple_GET_ITEM(co->co_consts, buffer[pc].oparg); + buffer[pc].opcode = _Py_IsImmortal(val) ? _LOAD_CONST_INLINE_BORROW : _LOAD_CONST_INLINE; + buffer[pc].operand = (uintptr_t)val; + break; + } + case _CHECK_PEP_523: { + /* Setting the eval frame function invalidates + * all executors, so no need to check dynamically */ + if (_PyInterpreterState_GET()->eval_frame == NULL) { + buffer[pc].opcode = _NOP; + } + break; + } case _CHECK_STACK_SPACE: { assert(corresponding_check_stack == NULL); corresponding_check_stack = &buffer[pc]; @@ -617,15 +625,13 @@ combine_stack_space_checks( break; } case _JUMP_TO_TOP: - case _EXIT_TRACE: { + case _EXIT_TRACE: goto finish; - } #ifdef Py_DEBUG case _CHECK_STACK_SPACE_OPERAND: { /* We should never see _CHECK_STACK_SPACE_OPERANDs. * They are only created at the end of this pass. */ - assert(false); - break; + Py_UNREACHABLE(); } #endif } @@ -642,43 +648,6 @@ combine_stack_space_checks( } } -static void -peephole_opt(_PyInterpreterFrame *frame, _PyUOpInstruction *buffer, int buffer_size) -{ - PyCodeObject *co = _PyFrame_GetCode(frame); - for (int pc = 0; pc < buffer_size; pc++) { - int opcode = buffer[pc].opcode; - switch(opcode) { - case _LOAD_CONST: { - assert(co != NULL); - PyObject *val = PyTuple_GET_ITEM(co->co_consts, buffer[pc].oparg); - buffer[pc].opcode = _Py_IsImmortal(val) ? _LOAD_CONST_INLINE_BORROW : _LOAD_CONST_INLINE; - buffer[pc].operand = (uintptr_t)val; - break; - } - case _CHECK_PEP_523: - { - /* Setting the eval frame function invalidates - * all executors, so no need to check dynamically */ - if (_PyInterpreterState_GET()->eval_frame == NULL) { - buffer[pc].opcode = _NOP; - } - break; - } - case _PUSH_FRAME: - case _POP_FRAME: - { - co = get_co(&buffer[pc]); - // if co is NULL, goto finish probably makes sense - break; - } - case _JUMP_TO_TOP: - case _EXIT_TRACE: - return; - } - } -} - // 0 - failure, no error raised, just fall back to Tier 1 // -1 - failure, and raise error // > 0 - length of optimized trace @@ -699,7 +668,6 @@ _Py_uop_analyze_and_optimize( } peephole_opt(frame, buffer, length); - combine_stack_space_checks(frame, buffer, length); length = optimize_uops( _PyFrame_GetCode(frame), buffer, From 486ece65e7808c7c61e2b448bb8eac7e7a2831c5 Mon Sep 17 00:00:00 2001 From: Peter Lazorchak Date: Mon, 1 Apr 2024 16:05:02 -0700 Subject: [PATCH 10/12] Address style nits --- Lib/test/test_capi/test_opt.py | 22 +++++++--------------- Modules/_testinternalcapi.c | 3 ++- Python/optimizer_analysis.c | 6 ++++-- 3 files changed, 13 insertions(+), 18 deletions(-) diff --git a/Lib/test/test_capi/test_opt.py b/Lib/test/test_capi/test_opt.py index edcb76929526bc..00111da01a5fd8 100644 --- a/Lib/test/test_capi/test_opt.py +++ b/Lib/test/test_capi/test_opt.py @@ -4,7 +4,6 @@ import unittest import gc import os -import platform import _opcode import _testinternalcapi @@ -966,12 +965,11 @@ def testfunc(n): a += b + c + d return a - res, ex = self._run_with_optimizer(testfunc, 32) self.assertEqual(res, 832) self.assertIsNotNone(ex) - uops_and_operands = [(opcode, operand) for opcode, _, _, operand in list(ex)] + uops_and_operands = [(opcode, operand) for opcode, _, _, operand in ex] uop_names = [uop[0] for uop in uops_and_operands] self.assertEqual(uop_names.count("_PUSH_FRAME"), 2) self.assertEqual(uop_names.count("_POP_FRAME"), 2) @@ -994,12 +992,11 @@ def testfunc(n): a += b + c return a - res, ex = self._run_with_optimizer(testfunc, 32) self.assertEqual(res, 224) self.assertIsNotNone(ex) - uops_and_operands = [(opcode, operand) for opcode, _, _, operand in list(ex)] + uops_and_operands = [(opcode, operand) for opcode, _, _, operand in ex] uop_names = [uop[0] for uop in uops_and_operands] self.assertEqual(uop_names.count("_PUSH_FRAME"), 2) self.assertEqual(uop_names.count("_POP_FRAME"), 2) @@ -1030,12 +1027,11 @@ def testfunc(n): a += b + c + d + e return a - res, ex = self._run_with_optimizer(testfunc, 32) self.assertEqual(res, 800) self.assertIsNotNone(ex) - uops_and_operands = [(opcode, operand) for opcode, _, _, operand in list(ex)] + uops_and_operands = [(opcode, operand) for opcode, _, _, operand in ex] uop_names = [uop[0] for uop in uops_and_operands] self.assertEqual(uop_names.count("_PUSH_FRAME"), 4) self.assertEqual(uop_names.count("_POP_FRAME"), 4) @@ -1067,12 +1063,11 @@ def testfunc(n): a += b + c + d + e return a - res, ex = self._run_with_optimizer(testfunc, 32) self.assertEqual(res, 800) self.assertIsNotNone(ex) - uops_and_operands = [(opcode, operand) for opcode, _, _, operand in list(ex)] + uops_and_operands = [(opcode, operand) for opcode, _, _, operand in ex] uop_names = [uop[0] for uop in uops_and_operands] self.assertEqual(uop_names.count("_PUSH_FRAME"), 4) self.assertEqual(uop_names.count("_POP_FRAME"), 4) @@ -1112,12 +1107,11 @@ def testfunc(n): a += b + c + d return a - res, ex = self._run_with_optimizer(testfunc, 32) self.assertEqual(res, 96) self.assertIsNotNone(ex) - uops_and_operands = [(opcode, operand) for opcode, _, _, operand in list(ex)] + uops_and_operands = [(opcode, operand) for opcode, _, _, operand in ex] uop_names = [uop[0] for uop in uops_and_operands] self.assertEqual(uop_names.count("_PUSH_FRAME"), 15) self.assertEqual(uop_names.count("_POP_FRAME"), 15) @@ -1173,12 +1167,11 @@ def testfunc(n): b += dummy15(7) return b - res, ex = self._run_with_optimizer(testfunc, 32) self.assertEqual(res, 32 * (repetitions + 9)) self.assertIsNotNone(ex) - uops_and_operands = [(opcode, operand) for opcode, _, _, operand in list(ex)] + uops_and_operands = [(opcode, operand) for opcode, _, _, operand in ex] uop_names = [uop[0] for uop in uops_and_operands] self.assertEqual(uop_names.count("_PUSH_FRAME"), 2) self.assertEqual(uop_names.count("_CHECK_STACK_SPACE_OPERAND"), 1) @@ -1208,12 +1201,11 @@ def testfunc(n): a += dummy15(n) return a - res, ex = self._run_with_optimizer(testfunc, 32) self.assertEqual(res, 42 * 32) self.assertIsNotNone(ex) - uops_and_operands = [(opcode, operand) for opcode, _, _, operand in list(ex)] + uops_and_operands = [(opcode, operand) for opcode, _, _, operand in ex] uop_names = [uop[0] for uop in uops_and_operands] self.assertEqual(uop_names.count("_PUSH_FRAME"), 2) self.assertEqual(uop_names.count("_POP_FRAME"), 0) diff --git a/Modules/_testinternalcapi.c b/Modules/_testinternalcapi.c index 72d1441a3d9d9a..85c200e75af8e5 100644 --- a/Modules/_testinternalcapi.c +++ b/Modules/_testinternalcapi.c @@ -958,7 +958,8 @@ iframe_getlasti(PyObject *self, PyObject *frame) } static PyObject * -get_co_framesize(PyObject *self, PyObject *arg) { +get_co_framesize(PyObject *self, PyObject *arg) +{ if (!PyCode_Check(arg)) { PyErr_SetString(PyExc_TypeError, "argument must be a code object"); return NULL; diff --git a/Python/optimizer_analysis.c b/Python/optimizer_analysis.c index 6466816da8b011..a21679f366a74e 100644 --- a/Python/optimizer_analysis.c +++ b/Python/optimizer_analysis.c @@ -535,7 +535,9 @@ remove_unneeded_uops(_PyUOpInstruction *buffer, int buffer_size) /* _PUSH_FRAME/_POP_FRAME's operand can be 0, a PyFunctionObject *, or a * PyCodeObject *. Retrieve the code object if possible. */ -static PyCodeObject *get_co(_PyUOpInstruction *op) { +static PyCodeObject * +get_co(_PyUOpInstruction *op) +{ assert(op->opcode == _PUSH_FRAME || op->opcode == _POP_FRAME); PyCodeObject *co = NULL; uint64_t operand = op->operand; @@ -596,7 +598,7 @@ peephole_opt(_PyInterpreterFrame *frame, _PyUOpInstruction *buffer, int buffer_s assert(framesize > 0); curr_space += framesize; if (curr_space < 0 || curr_space > INT32_MAX) { - // overflow or won't fit in operand + // won't fit in signed 32-bit int goto finish; } max_space = curr_space > max_space ? curr_space : max_space; From 64fac3835d99d56e0d5571b072f86933bd521c53 Mon Sep 17 00:00:00 2001 From: Peter Lazorchak Date: Mon, 1 Apr 2024 17:35:33 -0700 Subject: [PATCH 11/12] Undo accidental newline removal --- Lib/test/test_capi/test_opt.py | 1 + 1 file changed, 1 insertion(+) diff --git a/Lib/test/test_capi/test_opt.py b/Lib/test/test_capi/test_opt.py index 00111da01a5fd8..d0eacbf7e0f19d 100644 --- a/Lib/test/test_capi/test_opt.py +++ b/Lib/test/test_capi/test_opt.py @@ -578,6 +578,7 @@ def testfunc(n): @requires_specialization @unittest.skipIf(os.getenv("PYTHON_UOPS_OPTIMIZE") == "0", "Needs uop optimizer to run.") class TestUopsOptimization(unittest.TestCase): + def _run_with_optimizer(self, testfunc, arg): res = None opt = _testinternalcapi.new_uop_optimizer() From 7b62c2be63e680b221ebcdb45a3acb664b2e5eb0 Mon Sep 17 00:00:00 2001 From: Peter Lazorchak Date: Wed, 3 Apr 2024 09:44:44 -0700 Subject: [PATCH 12/12] Fix variables names in large-framesize test comment --- Lib/test/test_capi/test_opt.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Lib/test/test_capi/test_opt.py b/Lib/test/test_capi/test_opt.py index d0eacbf7e0f19d..ceb49c3c7129cb 100644 --- a/Lib/test/test_capi/test_opt.py +++ b/Lib/test/test_capi/test_opt.py @@ -1155,8 +1155,8 @@ def dummy_large(a0): # a1 = a0 + 1 # a2 = a1 + 1 # .... - # a99999 = a99998 + 1 - # return a99999 + # a9999 = a9998 + 1 + # return a9999 def dummy15(z): y = dummy_large(z)