gh-105766: Add Locking Around Custom Allocators #105619
Changes shown from 4 commits. Commits in this PR: fcbeb6d, fbe26eb, 7191629, dd60989, 0f2a242, 321d1c2, 52c01ea, a213597.
Objects/obmalloc.c:
```diff
@@ -3,6 +3,7 @@
 #include "Python.h"
 #include "pycore_code.h"          // stats
 #include "pycore_pystate.h"       // _PyInterpreterState_GET
+#include "pycore_ceval.h"         // _PyEval_AcquireLock()
 
 #include "pycore_obmalloc.h"
 #include "pycore_pymem.h"
```
|
|
```diff
@@ -209,6 +210,8 @@ _PyMem_ArenaFree(void *Py_UNUSED(ctx), void *ptr,
 #define _PyObject (_PyRuntime.allocators.standard.obj)
 #define _PyMem_Debug (_PyRuntime.allocators.debug)
 #define _PyObject_Arena (_PyRuntime.allocators.obj_arena)
+#define _PyMem_Wrapped (_PyRuntime.allocators.wrapped_with_lock.mem)
+#define _PyObject_Wrapped (_PyRuntime.allocators.wrapped_with_lock.obj)
 
 
 /***************************/
```
```diff
@@ -530,14 +533,32 @@ PyMem_SetupDebugHooks(void)
     PyThread_release_lock(ALLOCATORS_MUTEX);
 }
 
+static int has_locking_wrapper(PyMemAllocatorEx *allocator);
+
 static void
 get_allocator_unlocked(PyMemAllocatorDomain domain, PyMemAllocatorEx *allocator)
 {
     switch(domain)
     {
-    case PYMEM_DOMAIN_RAW: *allocator = _PyMem_Raw; break;
-    case PYMEM_DOMAIN_MEM: *allocator = _PyMem; break;
-    case PYMEM_DOMAIN_OBJ: *allocator = _PyObject; break;
+    case PYMEM_DOMAIN_RAW:
+        *allocator = _PyMem_Raw;
+        break;
+    case PYMEM_DOMAIN_MEM:
+        if (has_locking_wrapper(&_PyMem)) {
+            *allocator = *(PyMemAllocatorEx *)_PyMem.ctx;
+        }
+        else {
+            *allocator = _PyMem;
+        }
+        break;
+    case PYMEM_DOMAIN_OBJ:
+        if (has_locking_wrapper(&_PyObject)) {
+            *allocator = *(PyMemAllocatorEx *)_PyObject.ctx;
+        }
+        else {
+            *allocator = _PyObject;
+        }
+        break;
     default:
         /* unknown domain: set all attributes to NULL */
         allocator->ctx = NULL;
```
|
|
```diff
@@ -548,15 +569,28 @@ get_allocator_unlocked(PyMemAllocatorDomain domain, PyMemAllocatorEx *allocator)
     }
 }
 
+static void maybe_add_locking_wrapper(
+    PyMemAllocatorDomain, PyMemAllocatorEx *, PyMemAllocatorEx *);
+
 static void
 set_allocator_unlocked(PyMemAllocatorDomain domain, PyMemAllocatorEx *allocator)
 {
     switch(domain)
     {
-    case PYMEM_DOMAIN_RAW: _PyMem_Raw = *allocator; break;
-    case PYMEM_DOMAIN_MEM: _PyMem = *allocator; break;
-    case PYMEM_DOMAIN_OBJ: _PyObject = *allocator; break;
-    /* ignore unknown domain */
+    case PYMEM_DOMAIN_RAW:
+        _PyMem_Raw = *allocator;
+        break;
+    case PYMEM_DOMAIN_MEM:
+        _PyMem = *allocator;
+        maybe_add_locking_wrapper(domain, &_PyMem, &_PyMem_Wrapped);
+        break;
+    case PYMEM_DOMAIN_OBJ:
+        _PyObject = *allocator;
+        maybe_add_locking_wrapper(domain, &_PyObject, &_PyObject_Wrapped);
+        break;
+    default:
+        /* ignore unknown domain */
+        return;
     }
 }
```
|
|
|
|
```diff
@@ -627,6 +661,192 @@ PyObject_SetArenaAllocator(PyObjectArenaAllocator *allocator)
  * around these uses of the runtime-global allocators state. */
 
 
+/************************************/
+/* locking around custom allocators */
+/************************************/
+
+static inline int
+should_lock(PyInterpreterState *interp)
+{
+    return interp->ceval.gil != _PyInterpreterState_Main()->ceval.gil;
+}
+
+static PyThreadState *
+get_tstate_for_main_gil(void)
+{
+    PyThreadState *tstate = _PyRuntimeState_GetFinalizing(&_PyRuntime);
+    if (tstate == NULL) {
+        /* To use its GIL, we only need the pointer and one field. */
+        static const PyThreadState _main_tstate = {
+            .interp = &_PyRuntime._main_interpreter,
+        };
+        tstate = (PyThreadState *)&_main_tstate;
+    }
+    return tstate;
+}
+
+static inline void
+acquire_custom_allocator_lock(PyThreadState *tstate)
+{
+    _PyEval_AcquireLock(tstate);
+}
+
+static inline void
+release_custom_allocator_lock(PyThreadState *tstate)
+{
+    _PyEval_ReleaseLock(tstate->interp, tstate);
+}
+
+static void *
+_PyMem_MallocLocked(void *ctx, size_t size)
+{
+    PyMemAllocatorEx *wrapped = (PyMemAllocatorEx *)ctx;
+    if (_PyRuntime.allocators.num_gils > 1) {
```
> **Review comment (Member)**, on `if (_PyRuntime.allocators.num_gils > 1) {`:
>
> You lock updates (writes) to this value. Is it safe to use without atomic access or a lock? (Requiring an atomic read here would presumably be a performance hit.) Rather than always checking, could the logic that does `... num_gils++;` become
>
> ```c
> num_gils++;
> if (num_gils == 2 && !has_locking_wrapper(...)) {
>     maybe_add_locking_wrapper(...);
> }
> ```
>
> so that the wrapped-pointer switch happens upon first creation of an additional GIL, while the main runtime GIL (the only GIL prior to the current code running) is still held? I'm not sure it is worth doing the opposite during finalization: once wrapped due to per-subinterpreter GILs, just stay wrapped. At least the wrappers won't have this conditional anymore. I suspect this thought is either overoptimization... or actually necessary to avoid locking/atomic access to `num_gils`.
>
> **Reply (Member, Author):** That's a great idea. I'll try it out.
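For concreteness, a minimal sketch of the reviewer's suggestion. The helper name `_PyMem_WrapAllocatorsOnce()` is hypothetical, and removing the `num_gils > 1` check from the wrapper thunks is the reviewer's proposal, not what this revision of the PR does; `HEAD_LOCK()`, `num_gils`, `has_locking_wrapper()`, and `maybe_add_locking_wrapper()` are taken from the PR itself.

```c
/* Hypothetical helper in Objects/obmalloc.c (name is illustrative):
 * wrap the MEM/OBJ allocators exactly once, at the moment a second
 * GIL comes into existence, so the wrappers never have to read
 * num_gils on the hot path. */
void
_PyMem_WrapAllocatorsOnce(void)
{
    if (!has_locking_wrapper(&_PyMem)) {
        maybe_add_locking_wrapper(PYMEM_DOMAIN_MEM, &_PyMem, &_PyMem_Wrapped);
    }
    if (!has_locking_wrapper(&_PyObject)) {
        maybe_add_locking_wrapper(PYMEM_DOMAIN_OBJ, &_PyObject, &_PyObject_Wrapped);
    }
}

/* ...and at the call site in init_interp_create_gil(), still under
 * HEAD_LOCK(runtime) and while the main GIL is the only GIL that has
 * ever run code, so no other thread can be inside the allocators yet: */
HEAD_LOCK(runtime);
runtime->allocators.num_gils++;
if (runtime->allocators.num_gils == 2) {
    _PyMem_WrapAllocatorsOnce();
}
HEAD_UNLOCK(runtime);
```

Since the wrapping happens before any concurrent allocator use is possible, the write to the function pointers needs no extra synchronization. The hunk resumes below with the body of `_PyMem_MallocLocked` and the remaining wrapper functions.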
```diff
+        PyThreadState *tstate = _PyThreadState_GET();
+        if (should_lock(tstate->interp)) {
+            tstate = get_tstate_for_main_gil();
+            acquire_custom_allocator_lock(tstate);
+            void *ptr = wrapped->malloc(wrapped->ctx, size);
+            release_custom_allocator_lock(tstate);
+            return ptr;
+        }
+    }
+    return wrapped->malloc(wrapped->ctx, size);
+}
+
+static void *
+_PyMem_CallocLocked(void *ctx, size_t nelem, size_t elsize)
+{
+    PyMemAllocatorEx *wrapped = (PyMemAllocatorEx *)ctx;
+    if (_PyRuntime.allocators.num_gils > 1) {
+        PyThreadState *tstate = _PyThreadState_GET();
+        if (should_lock(tstate->interp)) {
+            tstate = get_tstate_for_main_gil();
+            acquire_custom_allocator_lock(tstate);
+            void *ptr = wrapped->calloc(wrapped->ctx, nelem, elsize);
+            release_custom_allocator_lock(tstate);
+            return ptr;
+        }
+    }
+    return wrapped->calloc(wrapped->ctx, nelem, elsize);
+}
+
+static void *
+_PyMem_ReallocLocked(void *ctx, void *ptr, size_t new_size)
+{
+    PyMemAllocatorEx *wrapped = (PyMemAllocatorEx *)ctx;
+    if (_PyRuntime.allocators.num_gils > 1) {
+        PyThreadState *tstate = _PyThreadState_GET();
+        if (should_lock(tstate->interp)) {
+            tstate = get_tstate_for_main_gil();
+            acquire_custom_allocator_lock(tstate);
+            ptr = wrapped->realloc(wrapped->ctx, ptr, new_size);
+            release_custom_allocator_lock(tstate);
+            return ptr;
+        }
+    }
+    return wrapped->realloc(wrapped->ctx, ptr, new_size);
+}
+
+static void
+_PyMem_FreeLocked(void *ctx, void *ptr)
+{
+    PyMemAllocatorEx *wrapped = (PyMemAllocatorEx *)ctx;
+    if (_PyRuntime.allocators.num_gils > 1) {
+        PyThreadState *tstate = _PyThreadState_GET();
+        if (should_lock(tstate->interp)) {
+            tstate = get_tstate_for_main_gil();
+            acquire_custom_allocator_lock(tstate);
+            wrapped->free(wrapped->ctx, ptr);
+            release_custom_allocator_lock(tstate);
+            return;
+        }
+    }
+    wrapped->free(wrapped->ctx, ptr);
+}
+
+static int
+has_locking_wrapper(PyMemAllocatorEx *allocator)
+{
+    if (allocator->ctx == NULL) {
+        return 0;
+    }
+    return (allocator->malloc == _PyMem_MallocLocked
+            || allocator->calloc == _PyMem_CallocLocked
+            || allocator->realloc == _PyMem_ReallocLocked
+            || allocator->free == _PyMem_FreeLocked);
+}
+
+static void
+maybe_add_locking_wrapper(PyMemAllocatorDomain domain,
+                          PyMemAllocatorEx *allocator,
+                          PyMemAllocatorEx *wrapped)
+{
+    assert(domain == PYMEM_DOMAIN_MEM || domain == PYMEM_DOMAIN_OBJ);
+
+    *wrapped = (PyMemAllocatorEx){0};
+
+    if (allocator->malloc == _PyMem_DebugMalloc) {
+        /* The debug allocator only wraps an already set allocator,
+         * which would have gone through this function already. */
+        return;
+    }
+
+    void *ctx = allocator->ctx;
+
+    /* What is the likelihood of reentrancy with the wrapper funcs?
+     * For now we assume it is effectively zero. */
+
+    if (allocator->malloc != _PyMem_RawMalloc
+#ifdef WITH_PYMALLOC
+        && allocator->malloc != _PyObject_Malloc
+#endif
+        && allocator->malloc != _PyMem_MallocLocked)
+    {
+        wrapped->ctx = ctx;
+        wrapped->malloc = allocator->malloc;
+        allocator->ctx = wrapped;
+        allocator->malloc = _PyMem_MallocLocked;
+    }
+
+    if (allocator->calloc != _PyMem_RawCalloc
+#ifdef WITH_PYMALLOC
+        && allocator->calloc != _PyObject_Calloc
+#endif
+        && allocator->calloc != _PyMem_CallocLocked)
+    {
+        wrapped->ctx = ctx;
+        wrapped->calloc = allocator->calloc;
+        allocator->ctx = wrapped;
+        allocator->calloc = _PyMem_CallocLocked;
+    }
+
+    if (allocator->realloc != _PyMem_RawRealloc
+#ifdef WITH_PYMALLOC
+        && allocator->realloc != _PyObject_Realloc
+#endif
+        && allocator->realloc != _PyMem_ReallocLocked)
+    {
+        wrapped->ctx = ctx;
+        wrapped->realloc = allocator->realloc;
+        allocator->ctx = wrapped;
+        allocator->realloc = _PyMem_ReallocLocked;
+    }
+
+    if (allocator->free != _PyMem_RawFree
+#ifdef WITH_PYMALLOC
+        && allocator->free != _PyObject_Free
+#endif
+        && allocator->free != _PyMem_FreeLocked)
+    {
+        wrapped->ctx = ctx;
+        wrapped->free = allocator->free;
+        allocator->ctx = wrapped;
+        allocator->free = _PyMem_FreeLocked;
+    }
+}
+
+
 /*************************/
 /* the "arena" allocator */
 /*************************/
```
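For orientation, here is a sketch of the embedder-facing scenario this hunk targets. `PyMem_SetAllocator()` and `PyMemAllocatorEx` are the real public C API; the counting allocator itself (`my_malloc` and friends, `install_counting_allocator`) is hypothetical, not from the PR. Such an allocator is not thread-safe on its own, and with per-interpreter GILs it can now be entered from several interpreters concurrently; under this PR, `set_allocator_unlocked()` routes it through the `_PyMem_*Locked` thunks above.

```c
#include <Python.h>
#include <stdlib.h>

/* Hypothetical embedder allocator: keeps a bookkeeping counter but
 * does no locking of its own.  The unsynchronized counter is exactly
 * the kind of state the locking wrappers protect. */
static size_t live_blocks = 0;   /* deliberately unsynchronized */

static void *
my_malloc(void *ctx, size_t size)
{
    live_blocks++;
    /* PyMem allocators must return a unique pointer even for size 0. */
    return malloc(size ? size : 1);
}

static void *
my_calloc(void *ctx, size_t nelem, size_t elsize)
{
    live_blocks++;
    return calloc(nelem ? nelem : 1, elsize ? elsize : 1);
}

static void *
my_realloc(void *ctx, void *ptr, size_t new_size)
{
    if (ptr == NULL) {
        live_blocks++;
    }
    return realloc(ptr, new_size ? new_size : 1);
}

static void
my_free(void *ctx, void *ptr)
{
    if (ptr != NULL) {
        live_blocks--;
    }
    free(ptr);
}

/* Call before Py_Initialize().  Under this PR, PyMem_SetAllocator()
 * reaches set_allocator_unlocked(), which in turn calls
 * maybe_add_locking_wrapper() for the MEM and OBJ domains. */
void
install_counting_allocator(void)
{
    PyMemAllocatorEx alloc = {
        .ctx = NULL,
        .malloc = my_malloc,
        .calloc = my_calloc,
        .realloc = my_realloc,
        .free = my_free,
    };
    PyMem_SetAllocator(PYMEM_DOMAIN_MEM, &alloc);
    PyMem_SetAllocator(PYMEM_DOMAIN_OBJ, &alloc);
}
```

Because the wrapper stashes the original function pointers in `_PyMem_Wrapped`/`_PyObject_Wrapped` and swaps in its own `ctx`, `get_allocator_unlocked()` can still report the original allocator to callers of `PyMem_GetAllocator()` via the `has_locking_wrapper()` branch.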
Python/pylifecycle.c:
```diff
@@ -586,6 +586,7 @@ static PyStatus
 init_interp_create_gil(PyThreadState *tstate, int own_gil)
 {
     PyStatus status;
+    _PyRuntimeState *runtime = tstate->interp->runtime;
 
     /* finalize_interp_delete() comment explains why _PyEval_FiniGIL() is
        only called here. */
```
|
|
```diff
@@ -603,6 +604,9 @@ init_interp_create_gil(PyThreadState *tstate, int own_gil)
     if (_PyStatus_EXCEPTION(status)) {
         return status;
     }
+    HEAD_LOCK(runtime);
+    runtime->allocators.num_gils++;
+    HEAD_UNLOCK(runtime);
 
     return _PyStatus_OK();
 }
```
|
|
```diff
@@ -1730,6 +1734,11 @@ finalize_interp_delete(PyInterpreterState *interp)
     /* Cleanup auto-thread-state */
     _PyGILState_Fini(interp);
 
+    _PyRuntimeState *runtime = interp->runtime;
+    HEAD_LOCK(runtime);
+    runtime->allocators.num_gils--;
+    HEAD_UNLOCK(runtime);
+
     /* We can't call _PyEval_FiniGIL() here because destroying the GIL lock can
        fail when it is being awaited by another running daemon thread (see
        bpo-9901). Instead pycore_create_interpreter() destroys the previously
```
> **Review comment (Member):** I suggest `unsigned int`.
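A minimal sketch of what that suggestion would look like, assuming `num_gils` is declared in the runtime's allocators struct; the surrounding struct layout is assumed for illustration and is not shown in this diff.

```c
/* Assumed declaration site (layout illustrative only): the GIL count
 * can never be negative, so make it unsigned, per the review. */
struct _pymem_allocators {
    /* ... existing fields ... */
    unsigned int num_gils;   /* was (assumed): int num_gils; */
};
```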