python · methane · Jun 3, 2019 · Apr 19, 2019 · Apr 21, 2019 · Apr 21, 2019
diff --git a/Doc/whatsnew/3.8.rst b/Doc/whatsnew/3.8.rst
@@ -844,6 +844,10 @@ Optimizations
   methods up to 20--50%.  (Contributed by Serhiy Storchaka in :issue:`23867`,
   :issue:`35582` and :issue:`36127`.)
 
+* The ``LOAD_GLOBAL`` instruction now uses a new "per opcode cache" mechanism.
+  It is about 40% faster now.  (Contributed by Yury Selivanov and Inada Naoki in
+  :issue:`26219`.)
+
 
 Build and C API Changes
 =======================

diff --git a/Include/code.h b/Include/code.h
@@ -17,6 +17,8 @@ typedef uint16_t _Py_CODEUNIT;
 #  define _Py_OPARG(word) ((word) >> 8)
 #endif
 
+typedef struct _PyOpcache _PyOpcache;
+
 /* Bytecode object */
 typedef struct {
     PyObject_HEAD
@@ -49,6 +51,21 @@ typedef struct {
        Type is a void* to keep the format private in codeobject.c to force
        people to go through the proper APIs. */
     void *co_extra;
+
+    /* Per-opcode just-in-time cache
+     *
+     * To reduce cache size, we use indirect mapping from opcode index to
+     * cache object:
+     *   cache = co_opcache[co_opcache_map[next_instr - first_instr] - 1]
+     */
+
+    // co_opcache_map is indexed by (next_instr - first_instr).
+    //  * 0 means there is no cache for this opcode.
+    //  * n > 0 means there is cache in co_opcache[n-1].
+    unsigned char *co_opcache_map;
+    _PyOpcache *co_opcache;
+    int co_opcache_flag;  // used to determine when to create a cache.
+    unsigned char co_opcache_size;  // length of co_opcache.
 } PyCodeObject;
 
 /* Masks for co_flags above */

diff --git a/Include/internal/pycore_ceval.h b/Include/internal/pycore_ceval.h
@@ -31,6 +31,9 @@ PyAPI_FUNC(void) _PyEval_SignalAsyncExc(
 PyAPI_FUNC(void) _PyEval_ReInitThreads(
     _PyRuntimeState *runtime);
 
+/* Private function */
+void _PyEval_Fini(void);
+
 #ifdef __cplusplus
 }
 #endif

diff --git a/Include/internal/pycore_code.h b/Include/internal/pycore_code.h
@@ -0,0 +1,27 @@
+#ifndef Py_INTERNAL_CODE_H
+#define Py_INTERNAL_CODE_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct {  /* Cache payload for one LOAD_GLOBAL instruction */
+    PyObject *ptr;  /* Cached pointer (borrowed reference) */
+    uint64_t globals_ver;  /* ma_version_tag of the globals dict */
+    uint64_t builtins_ver; /* ma_version_tag of the builtins dict */
+} _PyOpcache_LoadGlobal;
+
+struct _PyOpcache {  /* One entry of PyCodeObject.co_opcache */
+    union {
+        _PyOpcache_LoadGlobal lg;  /* LOAD_GLOBAL is the only cached opcode so far */
+    } u;
+    char optimized;  /* NOTE(review): set outside this header; presumably marks u as filled -- confirm */
+};
+
+/* Private API: set up co's opcode cache; returns 0 on success, -1 on allocation failure */
+int _PyCode_InitOpcache(PyCodeObject *co);
+
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_INTERNAL_CODE_H */
diff --git a/Lib/test/test_dict_version.py b/Lib/test/test_dict_version.py
@@ -80,14 +80,14 @@ def test_setitem_same_value(self):
 
         # setting a key to the same value with dict.__setitem__
-        # must change the version
+        # must not change the version
-        self.check_version_changed(d, d.__setitem__, 'key', value)
+        self.check_version_dont_change(d, d.__setitem__, 'key', value)
 
         # setting a key to the same value with dict.update
-        # must change the version
+        # must not change the version
-        self.check_version_changed(d, d.update, key=value)
+        self.check_version_dont_change(d, d.update, key=value)
 
         d2 = self.new_dict(key=value)
-        self.check_version_changed(d, d.update, d2)
+        self.check_version_dont_change(d, d.update, d2)
 
     def test_setitem_equal(self):
         class AlwaysEqual:

diff --git a/Makefile.pre.in b/Makefile.pre.in
@@ -1070,6 +1070,7 @@ PYTHON_HEADERS= \
 		$(srcdir)/Include/internal/pycore_accu.h \
 		$(srcdir)/Include/internal/pycore_atomic.h \
 		$(srcdir)/Include/internal/pycore_ceval.h \
+		$(srcdir)/Include/internal/pycore_code.h \
 		$(srcdir)/Include/internal/pycore_condvar.h \
 		$(srcdir)/Include/internal/pycore_context.h \
 		$(srcdir)/Include/internal/pycore_fileutils.h \

diff --git a/Misc/NEWS.d/next/Core and Builtins/2019-05-29-22-03-09.bpo-26219.Ovf1Qs.rst b/Misc/NEWS.d/next/Core and Builtins/2019-05-29-22-03-09.bpo-26219.Ovf1Qs.rst
@@ -0,0 +1,3 @@
+Implemented a per-opcode cache mechanism; the ``LOAD_GLOBAL`` instruction now
+uses it. ``LOAD_GLOBAL`` is now about 40% faster. Contributed by Yury Selivanov
+and Inada Naoki.
diff --git a/Objects/codeobject.c b/Objects/codeobject.c
@@ -2,7 +2,9 @@
 
 #include "Python.h"
 #include "code.h"
+#include "opcode.h"
 #include "structmember.h"
+#include "pycore_code.h"
 #include "pycore_pystate.h"
 #include "pycore_tupleobject.h"
 #include "clinic/codeobject.c.h"
@@ -233,9 +235,56 @@ PyCode_New(int argcount, int posonlyargcount, int kwonlyargcount,
     co->co_zombieframe = NULL;
     co->co_weakreflist = NULL;
     co->co_extra = NULL;
+
+    co->co_opcache_map = NULL;
+    co->co_opcache = NULL;
+    co->co_opcache_flag = 0;
+    co->co_opcache_size = 0;
     return co;
 }
 
+/* Build co_opcache_map and co_opcache: one slot per LOAD_GLOBAL, 255 at most.
+   Return 0 on success, or -1 on allocation failure with co_opcache_map NULL. */
+int
+_PyCode_InitOpcache(PyCodeObject *co)
+{
+    Py_ssize_t co_size = PyBytes_Size(co->co_code) / sizeof(_Py_CODEUNIT);
+    co->co_opcache_map = (unsigned char *)PyMem_Calloc(co_size, 1);
+    if (co->co_opcache_map == NULL) {
+        return -1;
+    }
+
+    _Py_CODEUNIT *opcodes = (_Py_CODEUNIT*)PyBytes_AS_STRING(co->co_code);
+    Py_ssize_t opts = 0;
+    for (Py_ssize_t i = 0; i < co_size;) {
+        unsigned char opcode = _Py_OPCODE(opcodes[i]);
+        i++;  // 'i' is now aligned to (next_instr - first_instr)
+        // TODO: LOAD_METHOD, LOAD_ATTR
+        if (opcode == LOAD_GLOBAL && i < co_size) {  // map has co_size entries
+            co->co_opcache_map[i] = ++opts;
+            if (opts > 254) {
+                break;  // map entries are unsigned char, so stop at 255
+            }
+        }
+    }
+
+    if (opts) {
+        co->co_opcache = (_PyOpcache *)PyMem_Calloc(opts, sizeof(_PyOpcache));
+        if (co->co_opcache == NULL) {
+            PyMem_FREE(co->co_opcache_map);
+            co->co_opcache_map = NULL;  // code_dealloc() frees any non-NULL map
+            return -1;
+        }
+    }
+    else {
+        PyMem_FREE(co->co_opcache_map);  // no cacheable opcode: map unused
+        co->co_opcache_map = NULL;
+        co->co_opcache = NULL;
+    }
+    co->co_opcache_size = opts;
+    return 0;
+}
+
 PyCodeObject *
 PyCode_NewEmpty(const char *filename, const char *funcname, int firstlineno)
 {
@@ -458,6 +507,15 @@ code_new(PyTypeObject *type, PyObject *args, PyObject *kw)
 static void
 code_dealloc(PyCodeObject *co)
 {
+    if (co->co_opcache != NULL) {
+        PyMem_FREE(co->co_opcache);
+    }
+    if (co->co_opcache_map != NULL) {
+        PyMem_FREE(co->co_opcache_map);
+    }
+    co->co_opcache_flag = 0;
+    co->co_opcache_size = 0;
+
     if (co->co_extra != NULL) {
         PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
         _PyCodeObjectExtra *co_extra = co->co_extra;
@@ -504,6 +562,13 @@ code_sizeof(PyCodeObject *co, PyObject *Py_UNUSED(args))
         res += sizeof(_PyCodeObjectExtra) +
                (co_extra->ce_size-1) * sizeof(co_extra->ce_extras[0]);
     }
+    if (co->co_opcache != NULL) {
+        assert(co->co_opcache_map != NULL);
+        // co_opcache_map
+        res += PyBytes_GET_SIZE(co->co_code) / sizeof(_Py_CODEUNIT);
+        // co_opcache
+        res += co->co_opcache_size * sizeof(_PyOpcache);
+    }
     return PyLong_FromSsize_t(res);
 }
 

diff --git a/Objects/dictobject.c b/Objects/dictobject.c
@@ -1080,20 +1080,21 @@ insertdict(PyDictObject *mp, PyObject *key, Py_hash_t hash, PyObject *value)
         return 0;
     }
 
-    if (_PyDict_HasSplitTable(mp)) {
-        mp->ma_values[ix] = value;
-        if (old_value == NULL) {
-            /* pending state */
-            assert(ix == mp->ma_used);
-            mp->ma_used++;
+    if (old_value != value) {
+        if (_PyDict_HasSplitTable(mp)) {
+            mp->ma_values[ix] = value;
+            if (old_value == NULL) {
+                /* pending state */
+                assert(ix == mp->ma_used);
+                mp->ma_used++;
+            }
         }
+        else {
+            assert(old_value != NULL);
+            DK_ENTRIES(mp->ma_keys)[ix].me_value = value;
+        }
+        mp->ma_version_tag = DICT_NEXT_VERSION();
     }
-    else {
-        assert(old_value != NULL);
-        DK_ENTRIES(mp->ma_keys)[ix].me_value = value;
-    }
-
-    mp->ma_version_tag = DICT_NEXT_VERSION();
     Py_XDECREF(old_value); /* which **CAN** re-enter (see issue #22653) */
     ASSERT_CONSISTENT(mp);
     Py_DECREF(key);

diff --git a/PCbuild/pythoncore.vcxproj b/PCbuild/pythoncore.vcxproj
@@ -158,6 +158,7 @@
     <ClInclude Include="..\Include\import.h" />
     <ClInclude Include="..\Include\internal\pycore_accu.h" />
     <ClInclude Include="..\Include\internal\pycore_atomic.h" />
+    <ClInclude Include="..\Include\internal\pycore_code.h" />
     <ClInclude Include="..\Include\internal\pycore_ceval.h" />
     <ClInclude Include="..\Include\internal\pycore_condvar.h" />
     <ClInclude Include="..\Include\internal\pycore_context.h" />

diff --git a/PCbuild/pythoncore.vcxproj.filters b/PCbuild/pythoncore.vcxproj.filters
@@ -177,6 +177,9 @@
     <ClInclude Include="..\Include\internal\pycore_atomic.h">
       <Filter>Include</Filter>
     </ClInclude>
+    <ClInclude Include="..\Include\internal\pycore_code.h">
+      <Filter>Include</Filter>
+    </ClInclude>
     <ClInclude Include="..\Include\internal\pycore_ceval.h">
       <Filter>Include</Filter>
     </ClInclude>