Skip to content

bpo-26219: per opcode cache for LOAD_GLOBAL #12884

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 16 commits into from
Jun 3, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions Doc/whatsnew/3.8.rst
Original file line number Diff line number Diff line change
Expand Up @@ -844,6 +844,10 @@ Optimizations
methods up to 20--50%. (Contributed by Serhiy Storchaka in :issue:`23867`,
:issue:`35582` and :issue:`36127`.)

* ``LOAD_GLOBAL`` instruction now uses new "per opcode cache" mechanism.
It is about 40% faster now. (Contributed by Yury Selivanov and Inada Naoki in
:issue:`26219`.)


Build and C API Changes
=======================
Expand Down
17 changes: 17 additions & 0 deletions Include/code.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@ typedef uint16_t _Py_CODEUNIT;
# define _Py_OPARG(word) ((word) >> 8)
#endif

typedef struct _PyOpcache _PyOpcache;

/* Bytecode object */
typedef struct {
PyObject_HEAD
Expand Down Expand Up @@ -49,6 +51,21 @@ typedef struct {
Type is a void* to keep the format private in codeobject.c to force
people to go through the proper APIs. */
void *co_extra;

/* Per opcodes just-in-time cache
*
* To reduce cache size, we use indirect mapping from opcode index to
* cache object:
* cache = co_opcache[co_opcache_map[next_instr - first_instr] - 1]
*/

// co_opcache_map is indexed by (next_instr - first_instr).
// * 0 means there is no cache for this opcode.
// * n > 0 means there is cache in co_opcache[n-1].
unsigned char *co_opcache_map;
_PyOpcache *co_opcache;
int co_opcache_flag; // used to determine when create a cache.
unsigned char co_opcache_size; // length of co_opcache.
} PyCodeObject;

/* Masks for co_flags above */
Expand Down
3 changes: 3 additions & 0 deletions Include/internal/pycore_ceval.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,9 @@ PyAPI_FUNC(void) _PyEval_SignalAsyncExc(
PyAPI_FUNC(void) _PyEval_ReInitThreads(
_PyRuntimeState *runtime);

/* Private function */
void _PyEval_Fini(void);

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If it is really private, why include it in the header, instead of just putting the prototype in the .c file?
Also, why call call it a generic ...Eval_Fini as opposed to ...code_stats ?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Because other private functions are put in "internal" headers too.


#ifdef __cplusplus
}
#endif
Expand Down
27 changes: 27 additions & 0 deletions Include/internal/pycore_code.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
#ifndef Py_INTERNAL_CODE_H
#define Py_INTERNAL_CODE_H
#ifdef __cplusplus
extern "C" {
#endif

typedef struct {
PyObject *ptr; /* Cached pointer (borrowed reference) */
uint64_t globals_ver; /* ma_version of global dict */
uint64_t builtins_ver; /* ma_version of builtin dict */
} _PyOpcache_LoadGlobal;

struct _PyOpcache {
union {
_PyOpcache_LoadGlobal lg;
} u;
char optimized;
};

/* Private API */
int _PyCode_InitOpcache(PyCodeObject *co);


#ifdef __cplusplus
}
#endif
#endif /* !Py_INTERNAL_CODE_H */
6 changes: 3 additions & 3 deletions Lib/test/test_dict_version.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,14 +80,14 @@ def test_setitem_same_value(self):

# setting a key to the same value with dict.__setitem__
# must change the version
self.check_version_changed(d, d.__setitem__, 'key', value)
self.check_version_dont_change(d, d.__setitem__, 'key', value)

# setting a key to the same value with dict.update
# must change the version
self.check_version_changed(d, d.update, key=value)
self.check_version_dont_change(d, d.update, key=value)

d2 = self.new_dict(key=value)
self.check_version_changed(d, d.update, d2)
self.check_version_dont_change(d, d.update, d2)

def test_setitem_equal(self):
class AlwaysEqual:
Expand Down
1 change: 1 addition & 0 deletions Makefile.pre.in
Original file line number Diff line number Diff line change
Expand Up @@ -1070,6 +1070,7 @@ PYTHON_HEADERS= \
$(srcdir)/Include/internal/pycore_accu.h \
$(srcdir)/Include/internal/pycore_atomic.h \
$(srcdir)/Include/internal/pycore_ceval.h \
$(srcdir)/Include/internal/pycore_code.h \
$(srcdir)/Include/internal/pycore_condvar.h \
$(srcdir)/Include/internal/pycore_context.h \
$(srcdir)/Include/internal/pycore_fileutils.h \
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
Implemented per opcode cache mechanism and ``LOAD_GLOBAL`` instruction use
it. ``LOAD_GLOBAL`` is now about 40% faster. Contributed by Yury Selivanov,
and Inada Naoki.
65 changes: 65 additions & 0 deletions Objects/codeobject.c
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,9 @@

#include "Python.h"
#include "code.h"
#include "opcode.h"
#include "structmember.h"
#include "pycore_code.h"
#include "pycore_pystate.h"
#include "pycore_tupleobject.h"
#include "clinic/codeobject.c.h"
Expand Down Expand Up @@ -233,9 +235,56 @@ PyCode_New(int argcount, int posonlyargcount, int kwonlyargcount,
co->co_zombieframe = NULL;
co->co_weakreflist = NULL;
co->co_extra = NULL;

co->co_opcache_map = NULL;
co->co_opcache = NULL;

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Purely as a style question ... why did you change the order of the fields? (as opposed to initializing in definition order)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I didn't changed order in here... I changed only order of struct for compactness and
readability. I didn't changed here because (a) I just missed it and (b) order in here is
not so important than order in struct.

co->co_opcache_flag = 0;
co->co_opcache_size = 0;
return co;
}

int
_PyCode_InitOpcache(PyCodeObject *co)
{
Py_ssize_t co_size = PyBytes_Size(co->co_code) / sizeof(_Py_CODEUNIT);
co->co_opcache_map = (unsigned char *)PyMem_Calloc(co_size, 1);
if (co->co_opcache_map == NULL) {
return -1;
}

_Py_CODEUNIT *opcodes = (_Py_CODEUNIT*)PyBytes_AS_STRING(co->co_code);
Py_ssize_t opts = 0;

for (Py_ssize_t i = 0; i < co_size;) {
unsigned char opcode = _Py_OPCODE(opcodes[i]);
i++; // 'i' is now aligned to (next_instr - first_instr)

// TODO: LOAD_METHOD, LOAD_ATTR
if (opcode == LOAD_GLOBAL) {
co->co_opcache_map[i] = ++opts;
if (opts > 254) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Perhaps use 16-bit indices instead?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

To maximize "performance benefit / memory usage", I prefer using 1 byte for each instructions, instead of two bytes.

break;
}
}
}

if (opts) {
co->co_opcache = (_PyOpcache *)PyMem_Calloc(opts, sizeof(_PyOpcache));
if (co->co_opcache == NULL) {
PyMem_FREE(co->co_opcache_map);
return -1;
}
}
else {
PyMem_FREE(co->co_opcache_map);
co->co_opcache_map = NULL;
co->co_opcache = NULL;
}

co->co_opcache_size = opts;
return 0;
}

PyCodeObject *
PyCode_NewEmpty(const char *filename, const char *funcname, int firstlineno)
{
Expand Down Expand Up @@ -458,6 +507,15 @@ code_new(PyTypeObject *type, PyObject *args, PyObject *kw)
static void
code_dealloc(PyCodeObject *co)
{
if (co->co_opcache != NULL) {
PyMem_FREE(co->co_opcache);
}
if (co->co_opcache_map != NULL) {
PyMem_FREE(co->co_opcache_map);
}
co->co_opcache_flag = 0;
co->co_opcache_size = 0;

if (co->co_extra != NULL) {
PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
_PyCodeObjectExtra *co_extra = co->co_extra;
Expand Down Expand Up @@ -504,6 +562,13 @@ code_sizeof(PyCodeObject *co, PyObject *Py_UNUSED(args))
res += sizeof(_PyCodeObjectExtra) +
(co_extra->ce_size-1) * sizeof(co_extra->ce_extras[0]);
}
if (co->co_opcache != NULL) {
assert(co->co_opcache_map != NULL);
// co_opcache_map
res += PyBytes_GET_SIZE(co->co_code) / sizeof(_Py_CODEUNIT);
// co_opcache
res += co->co_opcache_size * sizeof(_PyOpcache);
}
return PyLong_FromSsize_t(res);
}

Expand Down
25 changes: 13 additions & 12 deletions Objects/dictobject.c
Original file line number Diff line number Diff line change
Expand Up @@ -1080,20 +1080,21 @@ insertdict(PyDictObject *mp, PyObject *key, Py_hash_t hash, PyObject *value)
return 0;
}

if (_PyDict_HasSplitTable(mp)) {
mp->ma_values[ix] = value;
if (old_value == NULL) {
/* pending state */
assert(ix == mp->ma_used);
mp->ma_used++;
if (old_value != value) {
if (_PyDict_HasSplitTable(mp)) {
mp->ma_values[ix] = value;
if (old_value == NULL) {
/* pending state */
assert(ix == mp->ma_used);
mp->ma_used++;
}
}
else {
assert(old_value != NULL);
DK_ENTRIES(mp->ma_keys)[ix].me_value = value;
}
mp->ma_version_tag = DICT_NEXT_VERSION();
}
else {
assert(old_value != NULL);
DK_ENTRIES(mp->ma_keys)[ix].me_value = value;
}

mp->ma_version_tag = DICT_NEXT_VERSION();
Py_XDECREF(old_value); /* which **CAN** re-enter (see issue #22653) */
ASSERT_CONSISTENT(mp);
Py_DECREF(key);
Expand Down
1 change: 1 addition & 0 deletions PCbuild/pythoncore.vcxproj
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,7 @@
<ClInclude Include="..\Include\import.h" />
<ClInclude Include="..\Include\internal\pycore_accu.h" />
<ClInclude Include="..\Include\internal\pycore_atomic.h" />
<ClInclude Include="..\Include\internal\pycore_code.h" />
<ClInclude Include="..\Include\internal\pycore_ceval.h" />
<ClInclude Include="..\Include\internal\pycore_condvar.h" />
<ClInclude Include="..\Include\internal\pycore_context.h" />
Expand Down
3 changes: 3 additions & 0 deletions PCbuild/pythoncore.vcxproj.filters
Original file line number Diff line number Diff line change
Expand Up @@ -177,6 +177,9 @@
<ClInclude Include="..\Include\internal\pycore_atomic.h">
<Filter>Include</Filter>
</ClInclude>
<ClInclude Include="..\Include\internal\pycore_code.h">
<Filter>Include</Filter>
</ClInclude>
<ClInclude Include="..\Include\internal\pycore_ceval.h">
<Filter>Include</Filter>
</ClInclude>
Expand Down
Loading