
bpo-44187: Quickening infrastructure #26264

Merged · 30 commits · Jun 7, 2021
Commits (30 total; the diff below shows changes from 19 commits)
7fae680
Add co_firstinstr field to code object.
markshannon May 17, 2021
c192bf2
Implement barebones quickening.
markshannon May 19, 2021
edce8de
Cleanup quickening internal API and use non-quickened bytecode when t…
markshannon May 20, 2021
2d8b14a
Tweak internal quickening API.
markshannon May 20, 2021
af5b90e
Flesh out superinstruction insertion code a bit.
markshannon May 20, 2021
526be3b
Add NEWS item
markshannon May 20, 2021
6577457
Add new file to Windows build.
markshannon May 20, 2021
bc7b418
Fix up public symbol.
markshannon May 20, 2021
b54feff
Tweaks
markshannon May 21, 2021
ffd6e87
Merge branch 'main' into quickening-infrastructure
markshannon May 21, 2021
8c12a0a
Clarify commments, fix assertions and switch sign of counter represen…
markshannon May 24, 2021
9e1a771
Remove 'HotPy' prefixes.
markshannon May 24, 2021
f0acdf0
Add more explanatory comments and rename a few macros for clarity.
markshannon May 25, 2021
8bd4487
Don't specialize instructions with EXTENDED_ARG.
markshannon May 25, 2021
d0ca916
Convert tracing dispatch to macro to ease keeping it in sync with nor…
markshannon May 25, 2021
ae520e5
Rename macro to avoid name clash.
markshannon May 26, 2021
e329b2e
Clarify and refactor quickening code. Account for EXTENDED_ARGs when …
markshannon May 27, 2021
9345959
Merge branch 'main' into quickening-infrastructure
markshannon May 27, 2021
2c06ed4
Move more heavily used fields of code object to front.
markshannon May 27, 2021
2d4e416
Fix more typos
markshannon May 27, 2021
39b3a93
Merge branch 'main' into quickening-infrastructure
markshannon May 27, 2021
ee2dae1
Make means of offset calculation explicit.
markshannon May 28, 2021
12078f0
Merge branch 'main' into quickening-infrastructure
markshannon May 28, 2021
16b985d
Add more explanatory comments.
markshannon Jun 1, 2021
8ea4b85
Use index+1 for speed and initialize adaptive cache entry when perfor…
markshannon Jun 1, 2021
55e673b
Add comment
markshannon Jun 1, 2021
b7c3995
Make sure that the uses of instruction index versus uses of nexti (in…
markshannon Jun 1, 2021
ecfa62b
Merge branch 'main' into quickening-infrastructure
markshannon Jun 4, 2021
ec50298
Merge branch 'main' into quickening-infrastructure
markshannon Jun 6, 2021
ab3a30b
Fix refleaks tests to account for quickened blocks
markshannon Jun 7, 2021
13 changes: 11 additions & 2 deletions Include/cpython/code.h
@@ -7,9 +7,11 @@ typedef uint16_t _Py_CODEUNIT;
#ifdef WORDS_BIGENDIAN
# define _Py_OPCODE(word) ((word) >> 8)
# define _Py_OPARG(word) ((word) & 255)
# define _Py_MAKECODEUNIT(opcode, oparg) (((opcode)<<8)|(oparg))
#else
# define _Py_OPCODE(word) ((word) & 255)
# define _Py_OPARG(word) ((word) >> 8)
# define _Py_MAKECODEUNIT(opcode, oparg) ((opcode)|((oparg)<<8))
#endif

typedef struct _PyOpcache _PyOpcache;
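The new _Py_MAKECODEUNIT macro is the inverse of the existing _Py_OPCODE/_Py_OPARG accessors, whichever endianness branch is selected. A minimal sketch of the round-trip, assuming the definitions above are in scope (the opcode/oparg values are arbitrary, and the helper name is hypothetical):

/* Sketch only: checks that _Py_MAKECODEUNIT round-trips through the
 * existing accessors. Not part of the patch. */
#include <assert.h>

static void
codeunit_roundtrip_sketch(void)
{
    _Py_CODEUNIT word = _Py_MAKECODEUNIT(100, 3);  /* arbitrary opcode/oparg */
    assert(_Py_OPCODE(word) == 100);
    assert(_Py_OPARG(word) == 3);
}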
@@ -43,24 +45,27 @@ struct PyCodeObject {
/* These fields are set with provided values on new code objects. */

// The hottest fields (in the eval loop) are grouped here at the top.
PyObject *co_code; /* instruction opcodes */
PyObject *co_consts; /* list (constants used) */
PyObject *co_names; /* list of strings (names used) */
_Py_CODEUNIT *co_firstinstr; /* Pointer to first instruction, used for quickening */
PyObject *co_exceptiontable; /* Byte string encoding exception handling table */
int co_flags; /* CO_..., see below */
int co_warmup; /* Warmup counter for quickening */

// The rest are not so impactful on performance.
int co_argcount; /* #arguments, except *args */
int co_posonlyargcount; /* #positional only arguments */
int co_kwonlyargcount; /* #keyword only arguments */
int co_stacksize; /* #entries needed for evaluation stack */
int co_firstlineno; /* first source line number */
PyObject *co_code; /* instruction opcodes */
PyObject *co_varnames; /* tuple of strings (local variable names) */
PyObject *co_cellvars; /* tuple of strings (cell variable names) */
PyObject *co_freevars; /* tuple of strings (free variable names) */
PyObject *co_filename; /* unicode (where it was loaded from) */
PyObject *co_name; /* unicode (name, for reference) */
PyObject *co_linetable; /* string (encoding addr<->lineno mapping) See
Objects/lnotab_notes.txt for details. */
PyObject *co_exceptiontable; /* Byte string encoding exception handling table */

/* These fields are set with computed values on new code objects. */

@@ -78,6 +83,10 @@
Type is a void* to keep the format private in codeobject.c to force
people to go through the proper APIs. */
void *co_extra;
/* Quickened instructions and cache, or NULL
This should be treated as opaque by all code except the specializer and
interpreter. */
union _cache_or_instruction *co_quickened;

/* Per opcodes just-in-time cache
*
101 changes: 100 additions & 1 deletion Include/internal/pycore_code.h
@@ -3,7 +3,9 @@
#ifdef __cplusplus
extern "C" {
#endif


/* Legacy Opcache */

typedef struct {
PyObject *ptr; /* Cached pointer (borrowed reference) */
uint64_t globals_ver; /* ma_version of global dict */
@@ -24,6 +26,103 @@ struct _PyOpcache {
char optimized;
};


/* PEP 659
* Specialization and quickening structs and helper functions
*/

typedef struct {
int32_t cache_count;
int32_t _; /* Force 8 byte size */
} _PyEntryZero;

/* Add specialized versions of entries to this union.
*
* Do not break the invariant: sizeof(SpecializedCacheEntry) == 8
* Preserving this invariant is necessary because:
- If any one form uses more space, then all must, and on 64-bit machines
this is likely to double the memory consumption of caches
- The function for calculating the offset of caches assumes a 4:1
cache:instruction size ratio. Changing that would need careful
analysis to choose a new function.
*/
typedef union {
_PyEntryZero zero;
PyObject *object;
} SpecializedCacheEntry;

#define INSTRUCTIONS_PER_ENTRY (sizeof(SpecializedCacheEntry)/sizeof(_Py_CODEUNIT))
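The 8-byte invariant and the resulting 4:1 instruction-to-entry size ratio can be verified at compile time. A sketch of such checks (not part of the patch; assumes a C11 compiler and the definitions above):

/* Sketch only: compile-time checks of the invariants stated above. */
#include <assert.h>   /* static_assert (C11) */

static_assert(sizeof(SpecializedCacheEntry) == 8,
              "every form of cache entry must stay 8 bytes");
static_assert(INSTRUCTIONS_PER_ENTRY == 4,
              "one cache entry spans four 16-bit code units (4:1 ratio)");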

/* Maximum size of code to quicken, in code units. */
#define MAX_SIZE_TO_QUICKEN 5000

typedef union _cache_or_instruction {
_Py_CODEUNIT code[1];
SpecializedCacheEntry entry;
} SpecializedCacheOrInstruction;

/* Get pointer to the nth cache entry, from the first instruction and n.
* Cache entries are indexed backwards, with [count-1] first in memory and [0] last.
* The zeroth entry immediately precedes the instructions.
*/
static inline SpecializedCacheEntry *
_GetSpecializedCacheEntry(_Py_CODEUNIT *first_instr, Py_ssize_t n)
{
SpecializedCacheOrInstruction *last_cache_plus_one = (SpecializedCacheOrInstruction *)first_instr;
assert(&last_cache_plus_one->code[0] == first_instr);
return &last_cache_plus_one[-1-n].entry;
}
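Concretely, the quickened block is laid out as ... entry[1], entry[0], instr 0, instr 1, ..., with first_instr pointing at the boundary, so entry n sits n+1 entries below the first instruction. A hedged sketch of that layout, assuming the declarations above (the helper name is hypothetical):

/* Sketch only: illustrates the backwards indexing of cache entries. */
#include <assert.h>

static void
cache_layout_sketch(_Py_CODEUNIT *first_instr)
{
    SpecializedCacheEntry *entry0 = _GetSpecializedCacheEntry(first_instr, 0);
    SpecializedCacheEntry *entry1 = _GetSpecializedCacheEntry(first_instr, 1);
    /* Entry 0 sits immediately below the first instruction ... */
    assert((char *)entry0 == (char *)first_instr - sizeof(SpecializedCacheEntry));
    /* ... and each further entry sits one 8-byte entry lower still. */
    assert((char *)entry1 == (char *)entry0 - sizeof(SpecializedCacheEntry));
}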

/* The following two functions form a pair.
*
* oparg_from_offset_and_index() is used to compute the oparg
* when quickening, so that offset_from_oparg_and_index()
* can be used at runtime to compute the offset.
*/
static inline int
oparg_from_offset_and_index(int offset, int index)
{
return offset-(index>>1);
}

static inline int
offset_from_oparg_and_index(int oparg, int index)
{
return (index>>1)+oparg;
}
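A worked example of the round-trip (values chosen arbitrarily): for an instruction at index 10 whose cache entry was placed at offset 9, quickening stores oparg = 9 - (10>>1) = 4, and the interpreter later recovers offset = (10>>1) + 4 = 9. As a sketch, assuming the helpers above (the function name is hypothetical):

/* Sketch only: the two helpers above invert each other for any
 * (offset, index) pair, since (index>>1) is added back unchanged. */
#include <assert.h>

static void
oparg_offset_roundtrip_sketch(void)
{
    int index = 10;    /* arbitrary instruction index */
    int offset = 9;    /* arbitrary cache offset chosen at quickening time */
    int oparg = oparg_from_offset_and_index(offset, index);   /* 9 - 5 == 4 */
    assert(offset_from_oparg_and_index(oparg, index) == offset);
}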

/* Get pointer to the cache entry associated with an instruction.
This doesn't check that an entry has been allocated for that instruction. */
static inline SpecializedCacheEntry *
_GetSpecializedCacheEntryForInstruction(_Py_CODEUNIT *first_instr, int index, int oparg)
{
return _GetSpecializedCacheEntry(
first_instr,
offset_from_oparg_and_index(oparg, index)
);
}

#define QUICKENING_WARMUP_DELAY 8

/* We want to compare to zero for efficiency, so we offset values accordingly */
#define QUICKENING_INITIAL_WARMUP_VALUE (-QUICKENING_WARMUP_DELAY)
#define QUICKENING_WARMUP_COLDEST 1

static inline void
PyCodeObject_IncrementWarmup(PyCodeObject * co)
{
co->co_warmup++;
}

/* Used by the interpreter to determine when a code object should be quickened */
static inline int
PyCodeObject_IsWarmedUp(PyCodeObject * co)
{
return (co->co_warmup == 0);
}
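Taken together: co_warmup starts at -QUICKENING_WARMUP_DELAY and counts upward on each increment, so the "is warm" test is a cheap comparison against zero, and quickening triggers on the call that brings the counter to zero. A hedged sketch of the progression for a fresh code object (the helper is hypothetical; the actual driver lives in _PyEval_EvalFrameDefault, shown further down):

/* Sketch only: expected counter progression for a freshly created
 * code object (co_warmup == QUICKENING_INITIAL_WARMUP_VALUE, i.e. -8). */
#include <assert.h>

static void
warmup_progression_sketch(PyCodeObject *co)
{
    assert(co->co_warmup == QUICKENING_INITIAL_WARMUP_VALUE);
    for (int call = 0; call < QUICKENING_WARMUP_DELAY; call++) {
        assert(!PyCodeObject_IsWarmedUp(co));        /* counter still negative */
        PyCodeObject_IncrementWarmup(co);
    }
    assert(PyCodeObject_IsWarmedUp(co));             /* counter reached zero */
}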

int _Py_Quicken(PyCodeObject *code);

/* Private API */
int _PyCode_InitOpcache(PyCodeObject *co);

1 change: 1 addition & 0 deletions Makefile.pre.in
@@ -378,6 +378,7 @@ PYTHON_OBJS= \
Python/pythonrun.o \
Python/pytime.o \
Python/bootstrap_hash.o \
Python/specialize.o \
Python/structmember.o \
Python/symtable.o \
Python/sysmodule.o \
New file (NEWS entry; path not shown in this view):
@@ -0,0 +1,3 @@
Implement quickening in the interpreter. This offers no advantages as
yet, but is an enabler of future optimizations. See PEP 659 for full
explanation.
8 changes: 8 additions & 0 deletions Objects/codeobject.c
@@ -293,8 +293,11 @@ PyCode_NewWithPosOnlyArgs(int argcount, int posonlyargcount, int kwonlyargcount,
co->co_nfreevars = nfreevars;
co->co_stacksize = stacksize;
co->co_flags = flags;
assert(PyBytes_GET_SIZE(code) % sizeof(_Py_CODEUNIT) == 0);
assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(code), sizeof(_Py_CODEUNIT)));
Py_INCREF(code);
co->co_code = code;
co->co_firstinstr = (_Py_CODEUNIT *)PyBytes_AS_STRING(code);
Py_INCREF(consts);
co->co_consts = consts;
Py_INCREF(names);
@@ -322,6 +325,8 @@ PyCode_NewWithPosOnlyArgs(int argcount, int posonlyargcount, int kwonlyargcount,
co->co_opcache = NULL;
co->co_opcache_flag = 0;
co->co_opcache_size = 0;
co->co_warmup = QUICKENING_INITIAL_WARMUP_VALUE;
co->co_quickened = NULL;
return co;
}

@@ -980,6 +985,9 @@ code_dealloc(PyCodeObject *co)
PyMem_Free(co->co_cell2arg);
if (co->co_weakreflist != NULL)
PyObject_ClearWeakRefs((PyObject*)co);
if (co->co_quickened) {
PyMem_Free(co->co_quickened);
}
PyObject_Free(co);
}

1 change: 1 addition & 0 deletions PCbuild/pythoncore.vcxproj
@@ -487,6 +487,7 @@
<ClCompile Include="..\Python\dtoa.c" />
<ClCompile Include="..\Python\Python-ast.c" />
<ClCompile Include="..\Python\pythonrun.c" />
<ClCompile Include="..\Python\specialize.c" />
<ClCompile Include="..\Python\suggestions.c" />
<ClCompile Include="..\Python\structmember.c" />
<ClCompile Include="..\Python\symtable.c" />
3 changes: 3 additions & 0 deletions PCbuild/pythoncore.vcxproj.filters
@@ -1103,6 +1103,9 @@
<ClCompile Include="..\Python\pythonrun.c">
<Filter>Python</Filter>
</ClCompile>
<ClCompile Include="..\Python\specialize.c">
<Filter>Python</Filter>
</ClCompile>
<ClCompile Include="..\Python\structmember.c">
<Filter>Python</Filter>
</ClCompile>
28 changes: 22 additions & 6 deletions Python/ceval.c
@@ -1343,6 +1343,14 @@ eval_frame_handle_pending(PyThreadState *tstate)
#define JUMPTO(x) (next_instr = first_instr + (x))
#define JUMPBY(x) (next_instr += (x))

/* Get opcode and oparg from original instructions, not quickened form. */
#define TRACING_NEXTOPARG() do { \
_Py_CODEUNIT word = ((_Py_CODEUNIT *)PyBytes_AS_STRING(co->co_code))[INSTR_OFFSET()]; \
opcode = _Py_OPCODE(word); \
oparg = _Py_OPARG(word); \
next_instr++; \
} while (0)
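For comparison, the regular NEXTOPARG dispatch macro (defined earlier in ceval.c and unchanged by this patch) reads the word that next_instr itself points at, which after quickening may be a quickened instruction; roughly:

#define NEXTOPARG()  do { \
        _Py_CODEUNIT word = *next_instr; \
        opcode = _Py_OPCODE(word); \
        oparg = _Py_OPARG(word); \
        next_instr++; \
    } while (0)

Keeping the two in lock-step is the reason the tracing dispatch was converted to a macro (see the "Convert tracing dispatch to macro" commit above).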

/* OpCode prediction macros
Some opcodes tend to come in pairs thus making it possible to
predict the second code when the first is run. For example,
@@ -1644,15 +1652,23 @@ _PyEval_EvalFrameDefault(PyThreadState *tstate, PyFrameObject *f, int throwflag)
if (PyDTrace_FUNCTION_ENTRY_ENABLED())
dtrace_function_entry(f);

/* Increment the warmup counter and quicken if warm enough
* _Py_Quicken is idempotent so we don't worry about overflow */
if (!PyCodeObject_IsWarmedUp(co)) {
PyCodeObject_IncrementWarmup(co);
if (PyCodeObject_IsWarmedUp(co)) {
if (_Py_Quicken(co)) {
goto exit_eval_frame;
}
}
}


names = co->co_names;
consts = co->co_consts;
fastlocals = f->f_localsptr;
freevars = f->f_localsptr + co->co_nlocals;
assert(PyBytes_Check(co->co_code));
assert(PyBytes_GET_SIZE(co->co_code) <= INT_MAX);
assert(PyBytes_GET_SIZE(co->co_code) % sizeof(_Py_CODEUNIT) == 0);
assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(co->co_code), sizeof(_Py_CODEUNIT)));
first_instr = (_Py_CODEUNIT *) PyBytes_AS_STRING(co->co_code);
first_instr = co->co_firstinstr;
/*
f->f_lasti refers to the index of the last instruction,
unless it's -1 in which case next_instr should be first_instr.
@@ -1757,7 +1773,7 @@ _PyEval_EvalFrameDefault(PyThreadState *tstate, PyFrameObject *f, int throwflag)

tracing_dispatch:
f->f_lasti = INSTR_OFFSET();
NEXTOPARG();
TRACING_NEXTOPARG();

if (PyDTrace_LINE_ENABLED())
maybe_dtrace_line(f, &trace_info);