From 4abdaa16f0e0593c7680e9acea401f8e6d1a4710 Mon Sep 17 00:00:00 2001 From: Mark Shannon Date: Thu, 2 Oct 2025 18:29:42 +0100 Subject: [PATCH 1/4] Implement assembler optimization for AArch64. Removes a number of unnecessary branches. Change-Id: I4965fe8d8b79f7d859ff9076d9c53f3ac7f094b2 --- Python/jit.c | 43 +++++++++++++++++++++++++++++++++++----- Tools/jit/_optimizers.py | 42 +++++++++++++++++++++++++++++++++++++++ Tools/jit/_schema.py | 2 ++ Tools/jit/_stencils.py | 2 ++ 4 files changed, 84 insertions(+), 5 deletions(-) diff --git a/Python/jit.c b/Python/jit.c index 01ec9c1fa6e8a9..a557f08701a6c1 100644 --- a/Python/jit.c +++ b/Python/jit.c @@ -167,11 +167,13 @@ set_bits(uint32_t *loc, uint8_t loc_start, uint64_t value, uint8_t value_start, // See https://developer.arm.com/documentation/ddi0602/2023-09/Base-Instructions // for instruction encodings: -#define IS_AARCH64_ADD_OR_SUB(I) (((I) & 0x11C00000) == 0x11000000) -#define IS_AARCH64_ADRP(I) (((I) & 0x9F000000) == 0x90000000) -#define IS_AARCH64_BRANCH(I) (((I) & 0x7C000000) == 0x14000000) -#define IS_AARCH64_LDR_OR_STR(I) (((I) & 0x3B000000) == 0x39000000) -#define IS_AARCH64_MOV(I) (((I) & 0x9F800000) == 0x92800000) +#define IS_AARCH64_ADD_OR_SUB(I) (((I) & 0x11C00000) == 0x11000000) +#define IS_AARCH64_ADRP(I) (((I) & 0x9F000000) == 0x90000000) +#define IS_AARCH64_BRANCH(I) (((I) & 0x7C000000) == 0x14000000) +#define IS_AARCH64_BRANCH_COND(I) (((I) & 0x7C000000) == 0x54000000) +#define IS_AARCH64_TEST_AND_BRANCH(I) (((I) & 0x7E000000) == 0x36000000) +#define IS_AARCH64_LDR_OR_STR(I) (((I) & 0x3B000000) == 0x39000000) +#define IS_AARCH64_MOV(I) (((I) & 0x9F800000) == 0x92800000) // LLD is a great reference for performing relocations... just keep in // mind that Tools/jit/build.py does filtering and preprocessing for us! @@ -332,6 +334,37 @@ patch_aarch64_21rx(unsigned char *location, uint64_t value) patch_aarch64_21r(location, value); } + +// 21-bit relative branch. 
+void +patch_aarch64_19r(unsigned char *location, uint64_t value) +{ + uint32_t *loc32 = (uint32_t *)location; + assert(IS_AARCH64_BRANCH_COND(*loc32)); + value -= (uintptr_t)location; + // Check that we're not out of range of 21 signed bits: + assert((int64_t)value >= -(1 << 20)); + assert((int64_t)value < (1 << 20)); + // Since instructions are 4-byte aligned, only use 19 bits: + assert(get_bits(value, 0, 2) == 0); + set_bits(loc32, 5, value, 2, 19); +} + +// 16-bit relative branch. +void +patch_aarch64_14r(unsigned char *location, uint64_t value) +{ + uint32_t *loc32 = (uint32_t *)location; + assert(IS_AARCH64_TEST_AND_BRANCH(*loc32)); + value -= (uintptr_t)location; + // Check that we're not out of range of 16 signed bits: + assert((int64_t)value >= -(1 << 15)); + assert((int64_t)value < (1 << 15)); + // Since instructions are 4-byte aligned, only use 14 bits: + assert(get_bits(value, 0, 2) == 0); + set_bits(loc32, 5, value, 2, 14); +} + // 28-bit relative branch. void patch_aarch64_26r(unsigned char *location, uint64_t value) diff --git a/Tools/jit/_optimizers.py b/Tools/jit/_optimizers.py index 33db110b728dba..780c5aa4c2c7b4 100644 --- a/Tools/jit/_optimizers.py +++ b/Tools/jit/_optimizers.py @@ -39,6 +39,41 @@ # Update with all of the inverted branches, too: _X86_BRANCHES |= {v: k for k, v in _X86_BRANCHES.items() if v} +_AARCH64_COND_CODES = { + # https://developer.arm.com/documentation/dui0801/b/CJAJIHAD?lang=en + "eq": "ne", + "ne": "eq", + "lt": "ge", + "ge": "lt", + "gt": "le", + "le": "gt", + "vs": "vc", + "vc": "vs", + "mi": "pl", + "pl": "mi", + "cs": "cc", + "cc": "cs", + "hs": "lo", + "lo": "hs", + "hi": "ls", + "ls": "hi", +} +# Branches are either b.{cond} or bc.{cond} +_AARCH64_BRANCHES = { + "b." + cond: ("b." + inverse if inverse else None) + for (cond, inverse) in _AARCH64_COND_CODES.items() +} | { + "bc." + cond: ("bc." 
+ inverse if inverse else None) + for (cond, inverse) in _AARCH64_COND_CODES.items() +} +# And four compare and branch instructions +_AARCH64_BRANCHES |= { + "cbz": "cbnz", + "cbnz": "cbz", + "tbz": "tbnz", + "tbnz": "tbz", +} + @dataclasses.dataclass class _Block: @@ -286,8 +321,15 @@ def run(self) -> None: class OptimizerAArch64(Optimizer): # pylint: disable = too-few-public-methods """aarch64-apple-darwin/aarch64-pc-windows-msvc/aarch64-unknown-linux-gnu""" + _branches = _AARCH64_BRANCHES + _re_branch = re.compile( + rf"\s*(?P<instruction>{'|'.join(_AARCH64_BRANCHES)})\s+(.+,\s+)*(?P<target>[\w.]+)" + ) + # https://developer.arm.com/documentation/ddi0602/2025-03/Base-Instructions/B--Branch- _re_jump = re.compile(r"\s*b\s+(?P<target>[\w.]+)") + # https://developer.arm.com/documentation/ddi0602/2025-09/Base-Instructions/RET--Return-from-subroutine- + _re_return = re.compile(r"\s*ret\b") class OptimizerX86(Optimizer): # pylint: disable = too-few-public-methods diff --git a/Tools/jit/_schema.py b/Tools/jit/_schema.py index 228fc389584dd7..54ca662958ff22 100644 --- a/Tools/jit/_schema.py +++ b/Tools/jit/_schema.py @@ -20,6 +20,7 @@ "R_AARCH64_ADR_GOT_PAGE", "R_AARCH64_ADR_PREL_PG_HI21", "R_AARCH64_CALL26", + "R_AARCH64_CONDBR19", "R_AARCH64_JUMP26", "R_AARCH64_ADD_ABS_LO12_NC", "R_AARCH64_LD64_GOT_LO12_NC", @@ -27,6 +28,7 @@ "R_AARCH64_MOVW_UABS_G1_NC", "R_AARCH64_MOVW_UABS_G2_NC", "R_AARCH64_MOVW_UABS_G3", + "R_AARCH64_TSTBR14", "R_X86_64_64", "R_X86_64_GOTPCREL", "R_X86_64_GOTPCRELX", diff --git a/Tools/jit/_stencils.py b/Tools/jit/_stencils.py index 14606b036db519..14aa479b4686d9 100644 --- a/Tools/jit/_stencils.py +++ b/Tools/jit/_stencils.py @@ -74,12 +74,14 @@ class HoleValue(enum.Enum): "R_AARCH64_ADR_GOT_PAGE": "patch_aarch64_21rx", "R_AARCH64_ADR_PREL_PG_HI21": "patch_aarch64_21r", "R_AARCH64_CALL26": "patch_aarch64_26r", + "R_AARCH64_CONDBR19": "patch_aarch64_19r", "R_AARCH64_JUMP26": "patch_aarch64_26r", "R_AARCH64_LD64_GOT_LO12_NC": "patch_aarch64_12x", "R_AARCH64_MOVW_UABS_G0_NC": 
"patch_aarch64_16a", "R_AARCH64_MOVW_UABS_G1_NC": "patch_aarch64_16b", "R_AARCH64_MOVW_UABS_G2_NC": "patch_aarch64_16c", "R_AARCH64_MOVW_UABS_G3": "patch_aarch64_16d", + "R_AARCH64_TSTBR14": "patch_aarch64_14r", # x86_64-unknown-linux-gnu: "R_X86_64_64": "patch_64", "R_X86_64_GOTPCRELX": "patch_x86_64_32rx", From b39e8759e7fe4adab52cc34329c4f9f10cb96a7c Mon Sep 17 00:00:00 2001 From: Mark Shannon Date: Fri, 10 Oct 2025 11:42:33 +0100 Subject: [PATCH 2/4] AArch64: Don't optimize 14 bit jumps to avoid failures when patching. Add Windows 19 bit jump relocation. Change-Id: Ib15ce7ab0de315c4dc1f544d600ae669024a4a6d --- Tools/jit/_optimizers.py | 7 ------- Tools/jit/_schema.py | 2 +- Tools/jit/_stencils.py | 2 +- 3 files changed, 2 insertions(+), 9 deletions(-) diff --git a/Tools/jit/_optimizers.py b/Tools/jit/_optimizers.py index 780c5aa4c2c7b4..c950cb4908f39b 100644 --- a/Tools/jit/_optimizers.py +++ b/Tools/jit/_optimizers.py @@ -66,13 +66,6 @@ "bc." + cond: ("bc." + inverse if inverse else None) for (cond, inverse) in _AARCH64_COND_CODES.items() } -# And four compare and branch instructions -_AARCH64_BRANCHES |= { - "cbz": "cbnz", - "cbnz": "cbz", - "tbz": "tbnz", - "tbnz": "tbz", -} @dataclasses.dataclass diff --git a/Tools/jit/_schema.py b/Tools/jit/_schema.py index 54ca662958ff22..c47e9af924a20e 100644 --- a/Tools/jit/_schema.py +++ b/Tools/jit/_schema.py @@ -10,6 +10,7 @@ "ARM64_RELOC_PAGEOFF12", "ARM64_RELOC_UNSIGNED", "IMAGE_REL_AMD64_REL32", + "IMAGE_REL_ARM64_BRANCH19", "IMAGE_REL_ARM64_BRANCH26", "IMAGE_REL_ARM64_PAGEBASE_REL21", "IMAGE_REL_ARM64_PAGEOFFSET_12A", @@ -28,7 +29,6 @@ "R_AARCH64_MOVW_UABS_G1_NC", "R_AARCH64_MOVW_UABS_G2_NC", "R_AARCH64_MOVW_UABS_G3", - "R_AARCH64_TSTBR14", "R_X86_64_64", "R_X86_64_GOTPCREL", "R_X86_64_GOTPCRELX", diff --git a/Tools/jit/_stencils.py b/Tools/jit/_stencils.py index 14aa479b4686d9..16bc1ea4e17e6b 100644 --- a/Tools/jit/_stencils.py +++ b/Tools/jit/_stencils.py @@ -61,6 +61,7 @@ class HoleValue(enum.Enum): # 
x86_64-pc-windows-msvc: "IMAGE_REL_AMD64_REL32": "patch_x86_64_32rx", # aarch64-pc-windows-msvc: + "IMAGE_REL_ARM64_BRANCH19": "patch_aarch64_19r", "IMAGE_REL_ARM64_BRANCH26": "patch_aarch64_26r", "IMAGE_REL_ARM64_PAGEBASE_REL21": "patch_aarch64_21rx", "IMAGE_REL_ARM64_PAGEOFFSET_12A": "patch_aarch64_12", @@ -81,7 +82,6 @@ class HoleValue(enum.Enum): "R_AARCH64_MOVW_UABS_G1_NC": "patch_aarch64_16b", "R_AARCH64_MOVW_UABS_G2_NC": "patch_aarch64_16c", "R_AARCH64_MOVW_UABS_G3": "patch_aarch64_16d", - "R_AARCH64_TSTBR14": "patch_aarch64_14r", # x86_64-unknown-linux-gnu: "R_X86_64_64": "patch_64", "R_X86_64_GOTPCRELX": "patch_x86_64_32rx", From 87015f3bbf310151ec8a16d18a66005fc7f851cd Mon Sep 17 00:00:00 2001 From: Mark Shannon Date: Fri, 10 Oct 2025 12:48:34 +0100 Subject: [PATCH 3/4] Add new relocation to match statement for Windows --- Tools/jit/_targets.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Tools/jit/_targets.py b/Tools/jit/_targets.py index 2f3969e7d0540c..d723916be8500f 100644 --- a/Tools/jit/_targets.py +++ b/Tools/jit/_targets.py @@ -324,7 +324,8 @@ def _handle_relocation( "Offset": offset, "Symbol": s, "Type": { - "Name": "IMAGE_REL_ARM64_BRANCH26" + "Name": "IMAGE_REL_ARM64_BRANCH19" + | "IMAGE_REL_ARM64_BRANCH26" | "IMAGE_REL_ARM64_PAGEBASE_REL21" | "IMAGE_REL_ARM64_PAGEOFFSET_12A" | "IMAGE_REL_ARM64_PAGEOFFSET_12L" as kind From 80ea07aa5cc131370c18aadae84b2ac5fd02076e Mon Sep 17 00:00:00 2001 From: Mark Shannon Date: Wed, 15 Oct 2025 10:40:25 +0100 Subject: [PATCH 4/4] Mach-O does not support 19 bit branches --- Python/jit.c | 16 ---------------- Tools/jit/_optimizers.py | 10 +++++++++- Tools/jit/_targets.py | 2 +- 3 files changed, 10 insertions(+), 18 deletions(-) diff --git a/Python/jit.c b/Python/jit.c index a557f08701a6c1..ebd0d90385e002 100644 --- a/Python/jit.c +++ b/Python/jit.c @@ -334,7 +334,6 @@ patch_aarch64_21rx(unsigned char *location, uint64_t value) patch_aarch64_21r(location, value); } - // 21-bit relative 
branch. void patch_aarch64_19r(unsigned char *location, uint64_t value) @@ -350,21 +349,6 @@ patch_aarch64_19r(unsigned char *location, uint64_t value) set_bits(loc32, 5, value, 2, 19); } -// 16-bit relative branch. -void -patch_aarch64_14r(unsigned char *location, uint64_t value) -{ - uint32_t *loc32 = (uint32_t *)location; - assert(IS_AARCH64_TEST_AND_BRANCH(*loc32)); - value -= (uintptr_t)location; - // Check that we're not out of range of 16 signed bits: - assert((int64_t)value >= -(1 << 15)); - assert((int64_t)value < (1 << 15)); - // Since instructions are 4-byte aligned, only use 14 bits: - assert(get_bits(value, 0, 2) == 0); - set_bits(loc32, 5, value, 2, 14); -} - // 28-bit relative branch. void patch_aarch64_26r(unsigned char *location, uint64_t value) diff --git a/Tools/jit/_optimizers.py b/Tools/jit/_optimizers.py index c950cb4908f39b..866417398b0ba5 100644 --- a/Tools/jit/_optimizers.py +++ b/Tools/jit/_optimizers.py @@ -311,8 +311,16 @@ def run(self) -> None: self.path.write_text(self._body()) +# Mach-O does not support the 19 bit branch relocations needed for branch reordering +class OptimizerAArch64_MachO(Optimizer): # pylint: disable = too-few-public-methods + """aarch64-apple-darwin""" + + # https://developer.arm.com/documentation/ddi0602/2025-03/Base-Instructions/B--Branch- + _re_jump = re.compile(r"\s*b\s+(?P<target>[\w.]+)") + + class OptimizerAArch64(Optimizer): # pylint: disable = too-few-public-methods - """aarch64-apple-darwin/aarch64-pc-windows-msvc/aarch64-unknown-linux-gnu""" + """aarch64-pc-windows-msvc/aarch64-unknown-linux-gnu""" _branches = _AARCH64_BRANCHES _re_branch = re.compile( diff --git a/Tools/jit/_targets.py b/Tools/jit/_targets.py index d62a86b977ca38..7ff7c4fba49652 100644 --- a/Tools/jit/_targets.py +++ b/Tools/jit/_targets.py @@ -565,7 +565,7 @@ def get_target(host: str) -> _COFF32 | _COFF64 | _ELF | _MachO: if re.fullmatch(r"aarch64-apple-darwin.*", host): host = "aarch64-apple-darwin" condition = "defined(__aarch64__) && 
defined(__APPLE__)" - optimizer = _optimizers.OptimizerAArch64 + optimizer = _optimizers.OptimizerAArch64_MachO target = _MachO(host, condition, optimizer=optimizer) elif re.fullmatch(r"aarch64-pc-windows-msvc", host): host = "aarch64-pc-windows-msvc"