bpo-44187: Quickening infrastructure #26264

Merged Jun 7, 2021 (30 commits).

Changes shown are from 8 of the 30 commits.

Commits:
7fae680
Add co_firstinstr field to code object.
markshannon May 17, 2021
c192bf2
Implement barebones quickening.
markshannon May 19, 2021
edce8de
Cleanup quickening internal API and use non-quickened bytecode when t…
markshannon May 20, 2021
2d8b14a
Tweak internal quickening API.
markshannon May 20, 2021
af5b90e
Flesh out superinstruction insertion code a bit.
markshannon May 20, 2021
526be3b
Add NEWS item
markshannon May 20, 2021
6577457
Add new file to Windows build.
markshannon May 20, 2021
bc7b418
Fix up public symbol.
markshannon May 20, 2021
b54feff
Tweaks
markshannon May 21, 2021
ffd6e87
Merge branch 'main' into quickening-infrastructure
markshannon May 21, 2021
8c12a0a
Clarify commments, fix assertions and switch sign of counter represen…
markshannon May 24, 2021
9e1a771
Remove 'HotPy' prefixes.
markshannon May 24, 2021
f0acdf0
Add more explanatory comments and rename a few macros for clarity.
markshannon May 25, 2021
8bd4487
Don't specialize instructions with EXTENDED_ARG.
markshannon May 25, 2021
d0ca916
Convert tracing dispatch to macro to ease keeping it in sync with nor…
markshannon May 25, 2021
ae520e5
Rename macro to avoid name clash.
markshannon May 26, 2021
e329b2e
Clarify and refactor quickening code. Account for EXTENDED_ARGs when …
markshannon May 27, 2021
9345959
Merge branch 'main' into quickening-infrastructure
markshannon May 27, 2021
2c06ed4
Move more heavily used fields of code object to front.
markshannon May 27, 2021
2d4e416
Fix more typos
markshannon May 27, 2021
39b3a93
Merge branch 'main' into quickening-infrastructure
markshannon May 27, 2021
ee2dae1
Make means of offset calculation explicit.
markshannon May 28, 2021
12078f0
Merge branch 'main' into quickening-infrastructure
markshannon May 28, 2021
16b985d
Add more explanatory comments.
markshannon Jun 1, 2021
8ea4b85
Use index+1 for speed and initialize adaptive cache entry when perfor…
markshannon Jun 1, 2021
55e673b
Add comment
markshannon Jun 1, 2021
b7c3995
Make sure that the uses of instruction index versus uses of nexti (in…
markshannon Jun 1, 2021
ecfa62b
Merge branch 'main' into quickening-infrastructure
markshannon Jun 4, 2021
ec50298
Merge branch 'main' into quickening-infrastructure
markshannon Jun 6, 2021
ab3a30b
Fix refleaks tests to account for quickened blocks
markshannon Jun 7, 2021
5 changes: 5 additions & 0 deletions Include/cpython/code.h
@@ -7,9 +7,11 @@ typedef uint16_t _Py_CODEUNIT;
#ifdef WORDS_BIGENDIAN
# define _Py_OPCODE(word) ((word) >> 8)
# define _Py_OPARG(word) ((word) & 255)
# define _Py_INSTRUCTION(opcode, oparg) (((opcode)<<8)|(oparg))
#else
# define _Py_OPCODE(word) ((word) & 255)
# define _Py_OPARG(word) ((word) >> 8)
# define _Py_INSTRUCTION(opcode, oparg) ((opcode)|((oparg)<<8))
#endif
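Illustrative note (not part of the diff): _Py_INSTRUCTION is the inverse of the existing _Py_OPCODE/_Py_OPARG accessors on either endianness. A minimal round-trip check, assuming <assert.h> is included and using 124 (LOAD_FAST at the time) purely as an example opcode value:

static void
check_instruction_roundtrip(void)
{
    /* Pack an opcode and oparg into one code unit, then read them back. */
    _Py_CODEUNIT word = _Py_INSTRUCTION(124, 3);
    assert(_Py_OPCODE(word) == 124);
    assert(_Py_OPARG(word) == 3);
}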

typedef struct _PyOpcache _PyOpcache;
@@ -24,6 +26,7 @@ struct PyCodeObject {
int co_stacksize; /* #entries needed for evaluation stack */
int co_flags; /* CO_..., see below */
int co_firstlineno; /* first source line number */
_Py_CODEUNIT *co_firstinstr; /* Pointer to first instruction, used for quickening */
PyObject *co_code; /* instruction opcodes */
PyObject *co_consts; /* list (constants used) */
PyObject *co_names; /* list of strings (names used) */
@@ -47,6 +50,7 @@
Type is a void* to keep the format private in codeobject.c to force
people to go through the proper APIs. */
void *co_extra;
union _hotpy_quickened *co_quickened;

/* Per opcodes just-in-time cache
*
@@ -62,6 +66,7 @@
_PyOpcache *co_opcache;
int co_opcache_flag; // used to determine when to create a cache.
unsigned char co_opcache_size; // length of co_opcache.
unsigned char co_warmup;
};

/* Masks for co_flags above */
82 changes: 81 additions & 1 deletion Include/internal/pycore_code.h
@@ -3,7 +3,9 @@
#ifdef __cplusplus
extern "C" {
#endif


/* Legacy Opcache */

typedef struct {
PyObject *ptr; /* Cached pointer (borrowed reference) */
uint64_t globals_ver; /* ma_version of global dict */
@@ -24,6 +26,84 @@ struct _PyOpcache {
char optimized;
};


/* PEP 659
* Specialization and quickening structs and helper functions
*/

typedef struct {
int32_t cache_count;
int32_t _;
} EntryZero;

/* Add specialized versions of entries to this union.
* Do not break this invariant: sizeof(HotPyCacheEntry) == 8 */
typedef union {
EntryZero zero;
PyObject *object;
} HotPyCacheEntry;

#define INSTRUCTIONS_PER_ENTRY (sizeof(HotPyCacheEntry)/sizeof(_Py_CODEUNIT))

/* Maximum size of code to quicken */
#define MAX_SIZE_TO_QUICKEN 5000

typedef union _hotpy_quickened {
_Py_CODEUNIT code[1];
HotPyCacheEntry entry;
} HotPyCacheOrInstruction;

static inline HotPyCacheEntry *
_HotPy_GetCacheEntry(_Py_CODEUNIT *first_instr, Py_ssize_t index)
{
HotPyCacheOrInstruction *last_cache_plus_one = (HotPyCacheOrInstruction *)first_instr;
assert(&last_cache_plus_one->code[0] == first_instr);
return &last_cache_plus_one[-1-index].entry;
}

/* Following two functions determine the index of a cache entry from the
* instruction index (in the instruction array) and the oparg.
* oparg_from_offset_and_index must be the inverse of
* offset_from_oparg_and_index
*/

Review comment (Member): This comment doesn't seem correct - they take the index and return oparg or offset.

static inline int
oparg_from_offset_and_index(int offset, int index)
{
return offset-(index>>1);
}

static inline int
offset_from_oparg_and_index(int oparg, int index)
{
return (index>>1)+oparg;
}
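Illustrative note (not part of the diff): for a fixed instruction index the two helpers above are mutual inverses, which is what allows a quickened instruction's oparg to encode a cache offset. A worked example with instruction index 10 and cache offset 7:

/* oparg_from_offset_and_index(7, 10)  ==  7 - (10 >> 1)  ==  2
 * offset_from_oparg_and_index(2, 10)  ==  (10 >> 1) + 2  ==  7
 * So the stored oparg is the cache offset relative to index/2; offsets more
 * than 255 past index/2 cannot be encoded, which is the oparg > 255 case
 * skipped in entries_needed() in Python/specialize.c below. */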

static inline HotPyCacheEntry *
_HotPy_GetCacheEntryForInstruction(_Py_CODEUNIT *first_instr, int index, int oparg)
{
return _HotPy_GetCacheEntry(
first_instr,
offset_from_oparg_and_index(oparg, index)
);
}

#define HOTPY_INITIAL_WARMUP 8

static inline void
PyCodeObject_IncrementWarmup(PyCodeObject * co)
{
co->co_warmup--;
}

static inline int
PyCodeObject_IsWarmedUp(PyCodeObject * co)
{
return (co->co_warmup == 0);
}

int _Py_Quicken(PyCodeObject *code);
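Illustrative note (not part of the diff): a sketch of the intended call pattern, mirroring the ceval.c change later in this PR; maybe_quicken is a hypothetical helper, not part of the patch. Despite its name, PyCodeObject_IncrementWarmup counts co_warmup down from HOTPY_INITIAL_WARMUP towards zero.

static int
maybe_quicken(PyCodeObject *co)
{
    if (!PyCodeObject_IsWarmedUp(co)) {
        PyCodeObject_IncrementWarmup(co);      /* decrements co_warmup */
        if (PyCodeObject_IsWarmedUp(co)) {
            return _Py_Quicken(co);            /* returns -1 on memory error */
        }
    }
    return 0;
}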

/* Private API */
int _PyCode_InitOpcache(PyCodeObject *co);

1 change: 1 addition & 0 deletions Makefile.pre.in
@@ -378,6 +378,7 @@ PYTHON_OBJS= \
Python/pythonrun.o \
Python/pytime.o \
Python/bootstrap_hash.o \
Python/specialize.o \
Python/structmember.o \
Python/symtable.o \
Python/sysmodule.o \
@@ -0,0 +1,3 @@
Implement quickening in the interpreter. This offers no advantages in
itself, but is an enabler of future optimizations. See PEP 659 for full
explanation.
9 changes: 9 additions & 0 deletions Objects/codeobject.c
@@ -241,8 +241,12 @@ PyCode_NewWithPosOnlyArgs(int argcount, int posonlyargcount, int kwonlyargcount,
co->co_nlocals = nlocals;
co->co_stacksize = stacksize;
co->co_flags = flags;
assert(PyBytes_GET_SIZE(code) <= INT_MAX);
assert(PyBytes_GET_SIZE(code) % sizeof(_Py_CODEUNIT) == 0);
assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(code), sizeof(_Py_CODEUNIT)));
Py_INCREF(code);
co->co_code = code;
co->co_firstinstr = (_Py_CODEUNIT *)PyBytes_AS_STRING(code);
Py_INCREF(consts);
co->co_consts = consts;
Py_INCREF(names);
@@ -271,6 +275,8 @@ PyCode_NewWithPosOnlyArgs(int argcount, int posonlyargcount, int kwonlyargcount,
co->co_opcache = NULL;
co->co_opcache_flag = 0;
co->co_opcache_size = 0;
co->co_warmup = HOTPY_INITIAL_WARMUP;
co->co_quickened = NULL;
return co;
}

@@ -678,6 +684,9 @@ code_dealloc(PyCodeObject *co)
PyObject_GC_Del(co->co_zombieframe);
if (co->co_weakreflist != NULL)
PyObject_ClearWeakRefs((PyObject*)co);
if (co->co_quickened) {
PyMem_Free(co->co_quickened);
}
PyObject_Free(co);
}

1 change: 1 addition & 0 deletions PCbuild/pythoncore.vcxproj
@@ -487,6 +487,7 @@
<ClCompile Include="..\Python\dtoa.c" />
<ClCompile Include="..\Python\Python-ast.c" />
<ClCompile Include="..\Python\pythonrun.c" />
<ClCompile Include="..\Python\specialize.c" />
<ClCompile Include="..\Python\suggestions.c" />
<ClCompile Include="..\Python\structmember.c" />
<ClCompile Include="..\Python\symtable.c" />
3 changes: 3 additions & 0 deletions PCbuild/pythoncore.vcxproj.filters
@@ -1103,6 +1103,9 @@
<ClCompile Include="..\Python\pythonrun.c">
<Filter>Python</Filter>
</ClCompile>
<ClCompile Include="..\Python\specialize.c">
<Filter>Python</Filter>
</ClCompile>
<ClCompile Include="..\Python\structmember.c">
<Filter>Python</Filter>
</ClCompile>
22 changes: 16 additions & 6 deletions Python/ceval.c
@@ -1639,15 +1639,21 @@ _PyEval_EvalFrameDefault(PyThreadState *tstate, PyFrameObject *f, int throwflag)
if (PyDTrace_FUNCTION_ENTRY_ENABLED())
dtrace_function_entry(f);

if (!PyCodeObject_IsWarmedUp(co)) {
PyCodeObject_IncrementWarmup(co);
if (PyCodeObject_IsWarmedUp(co)) {
if (_Py_Quicken(co)) {
goto exit_eval_frame;
}
}
}


names = co->co_names;
consts = co->co_consts;
fastlocals = f->f_localsplus;
freevars = f->f_localsplus + co->co_nlocals;
assert(PyBytes_Check(co->co_code));
assert(PyBytes_GET_SIZE(co->co_code) <= INT_MAX);
assert(PyBytes_GET_SIZE(co->co_code) % sizeof(_Py_CODEUNIT) == 0);
assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(co->co_code), sizeof(_Py_CODEUNIT)));
first_instr = (_Py_CODEUNIT *) PyBytes_AS_STRING(co->co_code);
first_instr = co->co_firstinstr;
/*
f->f_lasti refers to the index of the last instruction,
unless it's -1 in which case next_instr should be first_instr.
@@ -1752,7 +1758,11 @@ _PyEval_EvalFrameDefault(PyThreadState *tstate, PyFrameObject *f, int throwflag)

tracing_dispatch:
f->f_lasti = INSTR_OFFSET();
NEXTOPARG();
/* Get opcode and oparg from original instructions, not quickened form. */
_Py_CODEUNIT inst = ((_Py_CODEUNIT *)PyBytes_AS_STRING(co->co_code))[INSTR_OFFSET()];
opcode = _Py_OPCODE(inst);
oparg = _Py_OPARG(inst);
next_instr++;

if (PyDTrace_LINE_ENABLED())
maybe_dtrace_line(f, &trace_info);
135 changes: 135 additions & 0 deletions Python/specialize.c
@@ -0,0 +1,135 @@

#define PY_LOCAL_AGGRESSIVE

#include "Python.h"
#include "pycore_code.h"


/* We lay out the quickened data as a bi-directional array:
* Instructions upwards, cache entries downwards.
* first_instr is aligned to a HotPyCacheEntry.
* The nth instruction is located at first_instr[n].
* The nth cache is located at ((HotPyCacheEntry *)first_instr)[-1-n].
* The first cache [-count] is reserved for the count, to enable finding
* the first instruction from the base pointer.
* We need to use the HotPyCacheOrInstruction union to refer to the data
* so as not to break aliasing rules.
*/

static HotPyCacheOrInstruction *
allocate(int cache_count, int instruction_count)
{
assert(sizeof(HotPyCacheOrInstruction) == sizeof(void *));
assert(sizeof(HotPyCacheEntry) == sizeof(void *));
assert(cache_count > 0);
int count = cache_count + (instruction_count + INSTRUCTIONS_PER_ENTRY -1)/INSTRUCTIONS_PER_ENTRY;
HotPyCacheOrInstruction *array = (HotPyCacheOrInstruction *)
PyMem_Malloc(sizeof(HotPyCacheOrInstruction) * count);
if (array == NULL) {
PyErr_NoMemory();
return NULL;
}
array[0].entry.zero.cache_count = cache_count;
return array;
}
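Illustrative note (not part of the diff): a worked example of this layout, assuming 8-byte cache entries and 2-byte code units (INSTRUCTIONS_PER_ENTRY == 4). For cache_count == 3 and instruction_count == 6, allocate() returns count == 3 + ceil(6/4) == 5 slots:

/* quickened[0] == ((HotPyCacheEntry *)first_instr)[-3] : EntryZero holding cache_count
 * quickened[1] == ((HotPyCacheEntry *)first_instr)[-2] : cache entry 1 (via [-1-1])
 * quickened[2] == ((HotPyCacheEntry *)first_instr)[-1] : cache entry 0 (via [-1-0])
 * quickened[3] == first_instr[0..3]                    : instructions 0-3
 * quickened[4] == first_instr[4..5]                    : instructions 4-5 (two unused code units)
 * where first_instr == first_instruction(quickened) == &quickened[3].code[0]. */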

static int
get_cache_count(HotPyCacheOrInstruction *quickened) {
return quickened[0].entry.zero.cache_count;
}

static uint8_t adaptive[256] = { 0 };

static uint8_t cache_requirements[256] = { 0 };
Review comment (Member): These variables could use some comment explaining their purpose. (Also, maybe we should plan to generate these from info added to opcode.py, like opcode_targets.h?)
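Illustrative note (not part of the diff): both tables are indexed by opcode; adaptive[op] holds the adaptive replacement opcode (0 meaning "do not specialize") and cache_requirements[op] holds the number of HotPyCacheEntry slots that adaptive form needs. Both are all zeros in this PR. A future specialization might be registered along these lines (LOAD_ATTR_ADAPTIVE is a hypothetical opcode name):

/* adaptive[LOAD_ATTR] = LOAD_ATTR_ADAPTIVE;   // quickened replacement opcode
 * cache_requirements[LOAD_ATTR] = 2;          // cache entries it consumes
 * entries_needed() and optimize() below then size the quickened block and
 * rewrite eligible instructions without further changes here. */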


static int
entries_needed(_Py_CODEUNIT *code, int len)
{
int cache_offset = 0;
for (int i = 0; i < len; i++) {
uint8_t opcode = _Py_OPCODE(code[i]);
uint8_t need = cache_requirements[opcode];
if (need == 0) {
continue;
}
assert(adaptive[opcode] != 0);
int oparg = oparg_from_offset_and_index(cache_offset, i);
assert(cache_offset == offset_from_oparg_and_index(oparg, i));
if (oparg < 0) {
cache_offset = i/2;
}
else if (oparg > 255) {
/* Cannot access required cache_offset */
Review comment (Member): Maybe in some kind of debug mode it would be nice to report whether this happens at all? If we see this frequently we need to change the strategy. OTOH maybe we never expect it and we could put assert(0) here???

continue;
}
cache_offset += need;
Review comment (Member): Looks like this will over-count if there are eligible opcodes with an EXTENDED_ARG prefix.

}
return cache_offset+1;
Review comment (Member): Suggested change: replace
return cache_offset+1;
with
return cache_offset + 1; // One extra for the count entry

}
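Illustrative note (not part of the diff): a worked trace of entries_needed(), assuming a single opcode X with cache_requirements[X] == 2 occurring at instruction indices 0 and 10, and no other opcode needing a cache:

/* i == 0 : oparg = 0 - (0 >> 1)  ==  0  -> cache_offset += 2  (now 2)
 * i == 10: oparg = 2 - (10 >> 1) == -3  -> cache_offset reset to 10/2 == 5,
 *          then cache_offset += 2       (now 7)
 * Returns 7 + 1 == 8: cache offsets 0-6 plus the reserved count entry,
 * matching the assert at the end of optimize() below. */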


static inline _Py_CODEUNIT *
first_instruction(HotPyCacheOrInstruction *quickened)
{
return &quickened[get_cache_count(quickened)].code[0];
}

static void
optimize(HotPyCacheOrInstruction *quickened, int len)
{
_Py_CODEUNIT *instructions = first_instruction(quickened);
int cache_offset = 0;
int previous_opcode = -1;
for(int i = 0; i < len; i++) {
int opcode = _Py_OPCODE(instructions[i]);
uint8_t adaptive_opcode = adaptive[opcode];
if (adaptive_opcode) {
int oparg = oparg_from_offset_and_index(cache_offset, i);
if (oparg < 0) {
cache_offset = i/2;
oparg = 0;
}
if (oparg < 256) {
instructions[i] = _Py_INSTRUCTION(adaptive_opcode, oparg);
cache_offset += cache_requirements[opcode];
}
}
else {
switch (opcode) {
/* Insert superinstructions here
E.g.
case LOAD_FAST:
if (previous_opcode == LOAD_FAST)
instructions[i-1] = _Py_INSTRUCTION(LOAD_FAST__LOAD_FAST, oparg);
*/
}
}
previous_opcode = opcode;
}
assert(cache_offset+1 == get_cache_count(quickened));
}

int
_Py_Quicken(PyCodeObject *code) {
Py_ssize_t size = PyBytes_Size(code->co_code);
int instr_count = (int)(size/sizeof(_Py_CODEUNIT));
if (instr_count > MAX_SIZE_TO_QUICKEN) {
Review comment (Member): Would it be possible instead to quicken the first 5000 instructions and then exit? (That would avoid a cliff where a minor change in the code tips it over the no-optimization limit and makes it run much slower.)

Reply: There's a memory cost to creating a duplicate code array, so it would risk wasting memory disproportionately.

code->co_warmup = 255;
return 0;
}
if (code->co_quickened) {
return 0;
}
int entry_count = entries_needed(code->co_firstinstr, instr_count);
HotPyCacheOrInstruction *quickened = allocate(entry_count, instr_count);
if (quickened == NULL) {
return -1;
}
_Py_CODEUNIT *new_instructions = first_instruction(quickened);
memcpy(new_instructions, code->co_firstinstr, size);
optimize(quickened, instr_count);
code->co_quickened = quickened;
code->co_firstinstr = new_instructions;
return 0;
}