-
-
Notifications
You must be signed in to change notification settings - Fork 32.1k
bpo-44187: Quickening infrastructure #26264
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 8 commits
7fae680
c192bf2
edce8de
2d8b14a
af5b90e
526be3b
6577457
bc7b418
b54feff
ffd6e87
8c12a0a
9e1a771
f0acdf0
8bd4487
d0ca916
ae520e5
e329b2e
9345959
2c06ed4
2d4e416
39b3a93
ee2dae1
12078f0
16b985d
8ea4b85
55e673b
b7c3995
ecfa62b
ec50298
ab3a30b
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -3,7 +3,9 @@ | |
#ifdef __cplusplus | ||
extern "C" { | ||
#endif | ||
|
||
|
||
/* Legacy Opcache */ | ||
|
||
typedef struct { | ||
PyObject *ptr; /* Cached pointer (borrowed reference) */ | ||
uint64_t globals_ver; /* ma_version of global dict */ | ||
|
@@ -24,6 +26,84 @@ struct _PyOpcache { | |
char optimized; | ||
}; | ||
|
||
|
||
/* PEP 659 | ||
* Specialization and quickening structs and helper functions | ||
*/ | ||
|
||
/* Entry zero of the cache array is reserved for bookkeeping:
 * it records how many cache entries precede the instruction array,
 * so the first instruction can be found from the base pointer.
 * Must be exactly 8 bytes so that sizeof(HotPyCacheEntry) == 8. */
typedef struct {
    int32_t cache_count;    /* Number of cache entries, including this one */
    int32_t _;              /* Padding: keeps the struct 8 bytes wide */
} EntryZero;
|
||
|
||
/* Add specialized versions of entries to this union. | ||
* Do not break this invariant: sizeof(HotPyCacheEntry) == 8 */ | ||
typedef union { | ||
EntryZero zero; | ||
PyObject *object; | ||
} HotPyCacheEntry; | ||
|
||
#define INSTRUCTIONS_PER_ENTRY (sizeof(HotPyCacheEntry)/sizeof(_Py_CODEUNIT)) | ||
|
||
/* Maximum size of code to quicken */ | ||
markshannon marked this conversation as resolved.
Show resolved
Hide resolved
|
||
#define MAX_SIZE_TO_QUICKEN 5000 | ||
|
||
typedef union _hotpy_quickened { | ||
_Py_CODEUNIT code[1]; | ||
HotPyCacheEntry entry; | ||
} HotPyCacheOrInstruction; | ||
|
||
static inline HotPyCacheEntry * | ||
_HotPy_GetCacheEntry(_Py_CODEUNIT *first_instr, Py_ssize_t index) | ||
{ | ||
HotPyCacheOrInstruction *last_cache_plus_one = (HotPyCacheOrInstruction *)first_instr; | ||
assert(&last_cache_plus_one->code[0] == first_instr); | ||
return &last_cache_plus_one[-1-index].entry; | ||
} | ||
|
||
/* The following two functions convert between the cache-entry offset
 * and the oparg of an adaptive instruction, given the instruction's
 * index in the instruction array.  (They compute the oparg/offset,
 * not the index.)
 * oparg_from_offset_and_index must be the exact inverse of
 * offset_from_oparg_and_index.
 */
|
||
/* Compute the oparg to store in an adaptive instruction at position
 * `index` (in the instruction array) so that its cache entry sits at
 * cache offset `offset`.  A result < 0 or > 255 means the offset is
 * not encodable at this index.
 * Must be the exact inverse of offset_from_oparg_and_index. */
static inline int
oparg_from_offset_and_index(int offset, int index)
{
    return offset - (index >> 1);
}
|
||
/* Compute the cache offset of the cache entry belonging to the
 * instruction at position `index` whose adaptive oparg is `oparg`.
 * Must be the exact inverse of oparg_from_offset_and_index. */
static inline int
offset_from_oparg_and_index(int oparg, int index)
{
    return (index >> 1) + oparg;
}
|
||
static inline HotPyCacheEntry * | ||
_HotPy_GetCacheEntryForInstruction(_Py_CODEUNIT *first_instr, int index, int oparg) | ||
{ | ||
return _HotPy_GetCacheEntry( | ||
first_instr, | ||
offset_from_oparg_and_index(oparg, index) | ||
); | ||
} | ||
|
||
#define HOTPY_INITIAL_WARMUP 8 | ||
|
||
static inline void | ||
PyCodeObject_IncrementWarmup(PyCodeObject * co) | ||
markshannon marked this conversation as resolved.
Show resolved
Hide resolved
|
||
{ | ||
co->co_warmup--; | ||
} | ||
|
||
static inline int | ||
PyCodeObject_IsWarmedUp(PyCodeObject * co) | ||
{ | ||
return (co->co_warmup == 0); | ||
} | ||
markshannon marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
int _Py_Quicken(PyCodeObject *code); | ||
|
||
/* Private API */ | ||
int _PyCode_InitOpcache(PyCodeObject *co); | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
Implement quickening in the interpreter. This offers no advantages in
itself, but is an enabler of future optimizations. See PEP 659 for a full
explanation.
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
@@ -0,0 +1,135 @@ | ||||||
|
||||||
#define PY_LOCAL_AGGRESSIVE | ||||||
markshannon marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||
|
||||||
#include "Python.h" | ||||||
#include "pycore_code.h" | ||||||
|
||||||
|
||||||
/* We lay out the quickened data as a bi-directional array:
 * instructions grow upwards, cache entries grow downwards.
 * first_instr is aligned to a HotPyCacheEntry.
 * The nth instruction is located at first_instr[n].
 * The nth cache entry is located at ((HotPyCacheEntry *)first_instr)[-1-n].
 * The first cache entry, at [-count], is reserved for the count, to enable
 * finding the first instruction from the base pointer.
 * We need to use the HotPyCacheOrInstruction union to refer to the data
 * so as not to break aliasing rules.
 */
|
||||||
static HotPyCacheOrInstruction * | ||||||
allocate(int cache_count, int instruction_count) | ||||||
{ | ||||||
assert(sizeof(HotPyCacheOrInstruction) == sizeof(void *)); | ||||||
assert(sizeof(HotPyCacheEntry) == sizeof(void *)); | ||||||
markshannon marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||
assert(cache_count > 0); | ||||||
gvanrossum marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||
int count = cache_count + (instruction_count + INSTRUCTIONS_PER_ENTRY -1)/INSTRUCTIONS_PER_ENTRY; | ||||||
HotPyCacheOrInstruction *array = (HotPyCacheOrInstruction *) | ||||||
PyMem_Malloc(sizeof(HotPyCacheOrInstruction) * count); | ||||||
if (array == NULL) { | ||||||
PyErr_NoMemory(); | ||||||
return NULL; | ||||||
} | ||||||
array[0].entry.zero.cache_count = cache_count; | ||||||
return array; | ||||||
} | ||||||
|
||||||
static int | ||||||
get_cache_count(HotPyCacheOrInstruction *quickened) { | ||||||
return quickened[0].entry.zero.cache_count; | ||||||
} | ||||||
|
||||||
/* For each opcode, the adaptive (specializable) form that optimize()
 * substitutes for it, or 0 if the opcode has no adaptive form.
 * TODO(review): consider generating these tables from opcode.py,
 * like opcode_targets.h. */
static uint8_t adaptive[256] = { 0 };

/* For each opcode, the number of cache entries its adaptive form
 * needs; 0 means the opcode is never quickened. */
static uint8_t cache_requirements[256] = { 0 };
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. These variables could use some comment explaining their purpose. (Also, maybe we should plan to generate these from info added to opcode.py, like opcode_targets.h? |
||||||
|
||||||
static int | ||||||
entries_needed(_Py_CODEUNIT *code, int len) | ||||||
{ | ||||||
int cache_offset = 0; | ||||||
for (int i = 0; i < len; i++) { | ||||||
uint8_t opcode = _Py_OPCODE(code[i]); | ||||||
uint8_t need = cache_requirements[opcode]; | ||||||
if (need == 0) { | ||||||
continue; | ||||||
} | ||||||
assert(adaptive[opcode] != 0); | ||||||
int oparg = oparg_from_offset_and_index(cache_offset, i); | ||||||
assert(cache_offset == offset_from_oparg_and_index(oparg, i)); | ||||||
if (oparg < 0) { | ||||||
cache_offset = i/2; | ||||||
} | ||||||
else if (oparg > 255) { | ||||||
/* Cannot access required cache_offset */ | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Maybe in some kind of debug mode it would be nice to report whether this happens at all? If we see this frequently we need to change the strategy. OTOH maybe we never expect it and we could put assert(0) here??? |
||||||
continue; | ||||||
} | ||||||
cache_offset += need; | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Looks like this will over-count if there are eligible opcodes with an |
||||||
} | ||||||
return cache_offset+1; | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||
} | ||||||
|
||||||
|
||||||
static inline _Py_CODEUNIT * | ||||||
first_instruction(HotPyCacheOrInstruction *quickened) | ||||||
{ | ||||||
return &quickened[get_cache_count(quickened)].code[0]; | ||||||
} | ||||||
|
||||||
static void | ||||||
optimize(HotPyCacheOrInstruction *quickened, int len) | ||||||
{ | ||||||
_Py_CODEUNIT *instructions = first_instruction(quickened); | ||||||
int cache_offset = 0; | ||||||
int previous_opcode = -1; | ||||||
for(int i = 0; i < len; i++) { | ||||||
gvanrossum marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||
int opcode = _Py_OPCODE(instructions[i]); | ||||||
uint8_t adaptive_opcode = adaptive[opcode]; | ||||||
if (adaptive_opcode) { | ||||||
int oparg = oparg_from_offset_and_index(cache_offset, i); | ||||||
if (oparg < 0) { | ||||||
cache_offset = i/2; | ||||||
oparg = 0; | ||||||
} | ||||||
if (oparg < 256) { | ||||||
instructions[i] = _Py_INSTRUCTION(adaptive_opcode, oparg); | ||||||
cache_offset += cache_requirements[opcode]; | ||||||
} | ||||||
} | ||||||
else { | ||||||
switch (opcode) { | ||||||
/* Insert superinstructions here | ||||||
E.g. | ||||||
case LOAD_FAST: | ||||||
if (previous_opcode == LOAD_FAST) | ||||||
gvanrossum marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||
instructions[i-1] = _Py_INSTRUCTION(LOAD_FAST__LOAD_FAST, oparg); | ||||||
*/ | ||||||
} | ||||||
} | ||||||
previous_opcode = opcode; | ||||||
} | ||||||
assert(cache_offset+1 == get_cache_count(quickened)); | ||||||
} | ||||||
|
||||||
int | ||||||
_Py_Quicken(PyCodeObject *code) { | ||||||
Py_ssize_t size = PyBytes_Size(code->co_code); | ||||||
isidentical marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||
int instr_count = (int)(size/sizeof(_Py_CODEUNIT)); | ||||||
if (instr_count > MAX_SIZE_TO_QUICKEN) { | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Would it be possible instead to quicken the first 5000 instructions and then exit? (That would avoid a cliff where a minor change in the code tips it over the no-optimization limit and makes it run much slower.) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. There's a memory cost to creating a duplicate code array, so it would risk wasting memory unproportionally. |
||||||
code->co_warmup = 255; | ||||||
return 0; | ||||||
} | ||||||
if (code->co_quickened) { | ||||||
return 0; | ||||||
} | ||||||
int entry_count = entries_needed(code->co_firstinstr, instr_count); | ||||||
HotPyCacheOrInstruction *quickened = allocate(entry_count, instr_count); | ||||||
if (quickened == NULL) { | ||||||
return -1; | ||||||
} | ||||||
_Py_CODEUNIT *new_instructions = first_instruction(quickened); | ||||||
memcpy(new_instructions, code->co_firstinstr, size); | ||||||
optimize(quickened, instr_count); | ||||||
code->co_quickened = quickened; | ||||||
code->co_firstinstr = new_instructions; | ||||||
return 0; | ||||||
} | ||||||
|
Uh oh!
There was an error while loading. Please reload this page.