-
-
Notifications
You must be signed in to change notification settings - Fork 32.5k
bpo-26219: per opcode cache for LOAD_GLOBAL #12884
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
2057cd7
ce7a59c
6a99da6
63d417c
9726902
4502f6c
f7b8d3a
774e13e
78e8b59
913fbee
d5250de
68480ed
f933246
6d49979
1616657
69be2e0
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
#ifndef Py_INTERNAL_CODE_H | ||
#define Py_INTERNAL_CODE_H | ||
#ifdef __cplusplus | ||
extern "C" { | ||
#endif | ||
|
||
/* Cache entry for a LOAD_GLOBAL instruction: the cached object pointer
   together with the version tags of the globals and builtins dicts.
   NOTE(review): presumably the entry is only trusted while both recorded
   versions still match the live dicts -- confirm against the user in ceval.c. */
typedef struct { | ||
methane marked this conversation as resolved.
Show resolved
Hide resolved
|
||
PyObject *ptr; /* Cached pointer (borrowed reference) */ | ||
uint64_t globals_ver; /* ma_version of global dict */ | ||
uint64_t builtins_ver; /* ma_version of builtin dict */ | ||
} _PyOpcache_LoadGlobal; | ||
|
||
/* One cache slot.  The union `u` holds the opcode-specific payload; the only
   member defined here is `lg`, the LOAD_GLOBAL entry.
   NOTE(review): the meaning of `optimized` is not visible in this header --
   presumably it records whether/how far the slot has been populated; confirm
   against the code that reads it. */
struct _PyOpcache { | ||
union { | ||
_PyOpcache_LoadGlobal lg; | ||
} u; | ||
char optimized; | ||
}; | ||
|
||
/* Private API */ | ||
int _PyCode_InitOpcache(PyCodeObject *co); | ||
|
||
|
||
#ifdef __cplusplus | ||
} | ||
#endif | ||
#endif /* !Py_INTERNAL_CODE_H */ |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
Implemented a per-opcode cache mechanism and made the ``LOAD_GLOBAL`` | ||
instruction use it. ``LOAD_GLOBAL`` is now about 40% faster. Contributed by | ||
Yury Selivanov and Inada Naoki. |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2,7 +2,9 @@ | |
|
||
#include "Python.h" | ||
#include "code.h" | ||
#include "opcode.h" | ||
#include "structmember.h" | ||
#include "pycore_code.h" | ||
#include "pycore_pystate.h" | ||
#include "pycore_tupleobject.h" | ||
#include "clinic/codeobject.c.h" | ||
|
@@ -233,9 +235,56 @@ PyCode_New(int argcount, int posonlyargcount, int kwonlyargcount, | |
co->co_zombieframe = NULL; | ||
co->co_weakreflist = NULL; | ||
co->co_extra = NULL; | ||
|
||
co->co_opcache_map = NULL; | ||
co->co_opcache = NULL; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Purely as a style question ... why did you change the order of the fields? (as opposed to initializing in definition order) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I didn't change the order here... I only changed the order in the struct, for compactness and |
||
co->co_opcache_flag = 0; | ||
co->co_opcache_size = 0; | ||
return co; | ||
} | ||
|
||
int | ||
_PyCode_InitOpcache(PyCodeObject *co) | ||
{ | ||
Py_ssize_t co_size = PyBytes_Size(co->co_code) / sizeof(_Py_CODEUNIT); | ||
co->co_opcache_map = (unsigned char *)PyMem_Calloc(co_size, 1); | ||
if (co->co_opcache_map == NULL) { | ||
return -1; | ||
} | ||
|
||
_Py_CODEUNIT *opcodes = (_Py_CODEUNIT*)PyBytes_AS_STRING(co->co_code); | ||
Py_ssize_t opts = 0; | ||
|
||
for (Py_ssize_t i = 0; i < co_size;) { | ||
unsigned char opcode = _Py_OPCODE(opcodes[i]); | ||
i++; // 'i' is now aligned to (next_instr - first_instr) | ||
|
||
// TODO: LOAD_METHOD, LOAD_ATTR | ||
if (opcode == LOAD_GLOBAL) { | ||
co->co_opcache_map[i] = ++opts; | ||
if (opts > 254) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Perhaps use 16-bit indices instead? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. To maximize "performance benefit / memory usage", I prefer using 1 byte for each instructions, instead of two bytes. |
||
break; | ||
} | ||
} | ||
} | ||
|
||
if (opts) { | ||
co->co_opcache = (_PyOpcache *)PyMem_Calloc(opts, sizeof(_PyOpcache)); | ||
if (co->co_opcache == NULL) { | ||
PyMem_FREE(co->co_opcache_map); | ||
return -1; | ||
} | ||
} | ||
else { | ||
PyMem_FREE(co->co_opcache_map); | ||
co->co_opcache_map = NULL; | ||
co->co_opcache = NULL; | ||
} | ||
|
||
co->co_opcache_size = opts; | ||
return 0; | ||
} | ||
|
||
PyCodeObject * | ||
PyCode_NewEmpty(const char *filename, const char *funcname, int firstlineno) | ||
{ | ||
|
@@ -458,6 +507,15 @@ code_new(PyTypeObject *type, PyObject *args, PyObject *kw) | |
static void | ||
code_dealloc(PyCodeObject *co) | ||
{ | ||
if (co->co_opcache != NULL) { | ||
PyMem_FREE(co->co_opcache); | ||
} | ||
if (co->co_opcache_map != NULL) { | ||
PyMem_FREE(co->co_opcache_map); | ||
} | ||
co->co_opcache_flag = 0; | ||
co->co_opcache_size = 0; | ||
|
||
if (co->co_extra != NULL) { | ||
PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE(); | ||
_PyCodeObjectExtra *co_extra = co->co_extra; | ||
|
@@ -504,6 +562,13 @@ code_sizeof(PyCodeObject *co, PyObject *Py_UNUSED(args)) | |
res += sizeof(_PyCodeObjectExtra) + | ||
(co_extra->ce_size-1) * sizeof(co_extra->ce_extras[0]); | ||
} | ||
if (co->co_opcache != NULL) { | ||
assert(co->co_opcache_map != NULL); | ||
// co_opcache_map | ||
res += PyBytes_GET_SIZE(co->co_code) / sizeof(_Py_CODEUNIT); | ||
// co_opcache | ||
res += co->co_opcache_size * sizeof(_PyOpcache); | ||
} | ||
return PyLong_FromSsize_t(res); | ||
} | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If it is really private, why include it in the header, instead of just putting the prototype in the .c file?
Also, why call it a generic ...Eval_Fini as opposed to ...code_stats?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Because other private functions are put in "internal" headers too.