From 4fd24bc4106a30e2e94d35f6b902e92850770d31 Mon Sep 17 00:00:00 2001 From: Pieter Eendebak Date: Sat, 23 May 2026 23:04:14 +0200 Subject: [PATCH] gh-NNNNN: Make _STORE_ATTR_SLOT lock-free on the free-threaded build Replace the LOCK_OBJECT / atomic-release-store / UNLOCK_OBJECT sequence in _STORE_ATTR_SLOT (and the matching critical section in PyMember_SetOne for Py_T_OBJECT_EX / _Py_T_OBJECT) with a single atomic exchange on the slot pointer. Atomic exchange returns a unique old value per writer, so Py_XDECREF cannot double-free across concurrent writers. Concurrent readers (_LOAD_ATTR_SLOT, PyMember_GetOne) already use atomic load + _Py_TryIncrefCompare; PyMember_GetOne's locked fallback is replaced by _Py_XGetRef so it also stays correct against lock-free writers. Drops HAS_DEOPT_FLAG from _STORE_ATTR_SLOT (no LOCK can fail anymore), which removes one _DEOPT exit per occurrence from Tier 2 traces. Adds _Py_atomic_exchange_ptr to the cases-generator's NON_ESCAPING_FUNCTIONS allowlist so the generator stops wrapping it in _PyFrame_SetStackPointer / _GetStackPointer. Adds a concurrent _Py_T_OBJECT test (test_T_OBJECT in test_slots.py). Benchmarks (PGO+LTO, taskset -c 4, pyperf): micro_store_attr_slot 79.0 ns -> 54.5 ns 1.45x faster micro_attr_idiv 180 ns -> 149 ns 1.21x faster bm_float 87.4 ms -> 83.7 ms 1.04x faster bm_nbody (control) unchanged Co-Authored-By: Claude Opus 4.7 (1M context) --- Include/internal/pycore_opcode_metadata.h | 2 +- Include/internal/pycore_uop_metadata.h | 2 +- Lib/test/test_free_threading/test_slots.py | 18 ++++++++ Modules/_testinternalcapi/test_cases.c.h | 13 +++--- Python/bytecodes.c | 15 +++++-- Python/executor_cases.c.h | 15 +++---- Python/generated_cases.c.h | 13 +++--- Python/structmember.c | 52 +++++++++++----------- Tools/cases_generator/analyzer.py | 1 + 9 files changed, 77 insertions(+), 54 deletions(-) diff --git a/Include/internal/pycore_opcode_metadata.h b/Include/internal/pycore_opcode_metadata.h index d2e29a1b95ede2f..e9a76e2f3f2ca34 100644 --- a/Include/internal/pycore_opcode_metadata.h +++ b/Include/internal/pycore_opcode_metadata.h @@ -1315,7 +1315,7 @@ const struct opcode_metadata _PyOpcode_opcode_metadata[267] = { [SET_UPDATE] = { true, INSTR_FMT_IB, HAS_ARG_FLAG | HAS_ERROR_FLAG | HAS_ERROR_NO_POP_FLAG | HAS_ESCAPES_FLAG }, [STORE_ATTR] = { true, INSTR_FMT_IBC000, HAS_ARG_FLAG | HAS_NAME_FLAG | HAS_ERROR_FLAG | HAS_ESCAPES_FLAG }, [STORE_ATTR_INSTANCE_VALUE] = { true, INSTR_FMT_IXC000, HAS_DEOPT_FLAG | HAS_EXIT_FLAG | HAS_ESCAPES_FLAG | HAS_RECORDS_VALUE_FLAG }, - [STORE_ATTR_SLOT] = { true, INSTR_FMT_IXC000, HAS_DEOPT_FLAG | HAS_EXIT_FLAG | HAS_ESCAPES_FLAG | HAS_RECORDS_VALUE_FLAG }, + [STORE_ATTR_SLOT] = { true, INSTR_FMT_IXC000, HAS_EXIT_FLAG | HAS_ESCAPES_FLAG | HAS_RECORDS_VALUE_FLAG }, [STORE_ATTR_WITH_HINT] = { true, INSTR_FMT_IBC000, HAS_ARG_FLAG | HAS_NAME_FLAG | HAS_DEOPT_FLAG | HAS_EXIT_FLAG | HAS_ESCAPES_FLAG | HAS_RECORDS_VALUE_FLAG }, [STORE_DEREF] = { true, INSTR_FMT_IB, HAS_ARG_FLAG | HAS_FREE_FLAG | HAS_ESCAPES_FLAG }, [STORE_FAST] = { true, INSTR_FMT_IB, HAS_ARG_FLAG | HAS_LOCAL_FLAG | HAS_ESCAPES_FLAG }, diff --git a/Include/internal/pycore_uop_metadata.h b/Include/internal/pycore_uop_metadata.h index 6713e9bc95f942d..e572110dbdd28fc 100644 --- a/Include/internal/pycore_uop_metadata.h +++ b/Include/internal/pycore_uop_metadata.h @@ -239,7 +239,7 @@ const uint32_t _PyUop_Flags[MAX_UOP_ID+1] = { [_STORE_ATTR_INSTANCE_VALUE] = HAS_ESCAPES_FLAG, [_LOCK_OBJECT] = HAS_DEOPT_FLAG, [_STORE_ATTR_WITH_HINT] = HAS_ARG_FLAG | HAS_NAME_FLAG | HAS_DEOPT_FLAG | HAS_ESCAPES_FLAG, - [_STORE_ATTR_SLOT] = HAS_DEOPT_FLAG | HAS_ESCAPES_FLAG, + [_STORE_ATTR_SLOT] = HAS_ESCAPES_FLAG, [_COMPARE_OP] = HAS_ARG_FLAG | HAS_ERROR_FLAG | HAS_ESCAPES_FLAG, [_COMPARE_OP_FLOAT] = HAS_ARG_FLAG, [_COMPARE_OP_INT] = HAS_ARG_FLAG, diff --git a/Lib/test/test_free_threading/test_slots.py b/Lib/test/test_free_threading/test_slots.py index a73525e1bebfb49..78a455a2fdd6d37 100644 --- a/Lib/test/test_free_threading/test_slots.py +++ b/Lib/test/test_free_threading/test_slots.py @@ -304,3 +304,21 @@ def reader(): spam_new.T_CHAR run_in_threads([writer, reader]) + + def test_T_OBJECT(self): + # _Py_T_OBJECT exercises PyMember_GetOne / PyMember_SetOne for + # the object case from concurrent threads. Uses + # _testcapi.HeapCTypeWithDict which exposes its `dict` slot as a + # `_Py_T_OBJECT` member named `dictobj`. + obj = _testcapi.HeapCTypeWithDict() + + def writer(): + for i in range(1_000): + obj.dictobj = {"k": i} + + def reader(): + for _ in range(1_000): + v = obj.dictobj + assert v is None or isinstance(v, dict) + + run_in_threads([writer, reader]) diff --git a/Modules/_testinternalcapi/test_cases.c.h b/Modules/_testinternalcapi/test_cases.c.h index a2506524f0bb6dc..020b2c6c067c653 100644 --- a/Modules/_testinternalcapi/test_cases.c.h +++ b/Modules/_testinternalcapi/test_cases.c.h @@ -11679,16 +11679,15 @@ value = stack_pointer[-2]; uint16_t index = read_u16(&this_instr[4].cache); PyObject *owner_o = PyStackRef_AsPyObjectBorrow(owner); - if (!LOCK_OBJECT(owner_o)) { - UPDATE_MISS_STATS(STORE_ATTR); - assert(_PyOpcode_Deopt[opcode] == (STORE_ATTR)); - JUMP_TO_PREDICTED(STORE_ATTR); - } char *addr = (char *)owner_o + index; STAT_INC(STORE_ATTR, hit); + #ifdef Py_GIL_DISABLED + PyObject *old_value = _Py_atomic_exchange_ptr( + (void **)addr, PyStackRef_AsPyObjectSteal(value)); + #else PyObject *old_value = *(PyObject **)addr; - FT_ATOMIC_STORE_PTR_RELEASE(*(PyObject **)addr, PyStackRef_AsPyObjectSteal(value)); - UNLOCK_OBJECT(owner_o); + *(PyObject **)addr = PyStackRef_AsPyObjectSteal(value); + #endif o = owner; stack_pointer[-2] = o; stack_pointer += -1; diff --git a/Python/bytecodes.c b/Python/bytecodes.c index f7487c7136962f1..166b16ee9cd165f 100644 --- a/Python/bytecodes.c +++ b/Python/bytecodes.c @@ -3225,14 +3225,21 @@ dummy_func( op(_STORE_ATTR_SLOT, (index/1, value, owner -- o)) { PyObject *owner_o = PyStackRef_AsPyObjectBorrow(owner); - DEOPT_IF(!LOCK_OBJECT(owner_o)); char *addr = (char *)owner_o + index; STAT_INC(STORE_ATTR, hit); + // Atomic exchange of the slot pointer. Each concurrent writer + // observes a unique old value, so Py_XDECREF below cannot + // double-free. Concurrent readers (_LOAD_ATTR_SLOT, + // PyMember_GetOne) cope via _Py_TryIncrefCompare. +#ifdef Py_GIL_DISABLED + PyObject *old_value = _Py_atomic_exchange_ptr( + (void **)addr, PyStackRef_AsPyObjectSteal(value)); +#else PyObject *old_value = *(PyObject **)addr; - FT_ATOMIC_STORE_PTR_RELEASE(*(PyObject **)addr, PyStackRef_AsPyObjectSteal(value)); - UNLOCK_OBJECT(owner_o); - INPUTS_DEAD(); + *(PyObject **)addr = PyStackRef_AsPyObjectSteal(value); +#endif o = owner; + DEAD(owner); Py_XDECREF(old_value); } diff --git a/Python/executor_cases.c.h b/Python/executor_cases.c.h index efa61d7de74e88c..f9fd21019d17db4 100644 --- a/Python/executor_cases.c.h +++ b/Python/executor_cases.c.h @@ -13066,18 +13066,15 @@ value = _stack_item_0; uint16_t index = (uint16_t)CURRENT_OPERAND0_16(); PyObject *owner_o = PyStackRef_AsPyObjectBorrow(owner); - if (!LOCK_OBJECT(owner_o)) { - UOP_STAT_INC(uopcode, miss); - _tos_cache1 = owner; - _tos_cache0 = value; - SET_CURRENT_CACHED_VALUES(2); - JUMP_TO_JUMP_TARGET(); - } char *addr = (char *)owner_o + index; STAT_INC(STORE_ATTR, hit); + #ifdef Py_GIL_DISABLED + PyObject *old_value = _Py_atomic_exchange_ptr( + (void **)addr, PyStackRef_AsPyObjectSteal(value)); + #else PyObject *old_value = *(PyObject **)addr; - FT_ATOMIC_STORE_PTR_RELEASE(*(PyObject **)addr, PyStackRef_AsPyObjectSteal(value)); - UNLOCK_OBJECT(owner_o); + *(PyObject **)addr = PyStackRef_AsPyObjectSteal(value); + #endif o = owner; stack_pointer[0] = o; stack_pointer += 1; diff --git a/Python/generated_cases.c.h b/Python/generated_cases.c.h index 53e09a8f4523c7c..b3a9b1f57e46d61 100644 --- a/Python/generated_cases.c.h +++ b/Python/generated_cases.c.h @@ -11676,16 +11676,15 @@ value = stack_pointer[-2]; uint16_t index = read_u16(&this_instr[4].cache); PyObject *owner_o = PyStackRef_AsPyObjectBorrow(owner); - if (!LOCK_OBJECT(owner_o)) { - UPDATE_MISS_STATS(STORE_ATTR); - assert(_PyOpcode_Deopt[opcode] == (STORE_ATTR)); - JUMP_TO_PREDICTED(STORE_ATTR); - } char *addr = (char *)owner_o + index; STAT_INC(STORE_ATTR, hit); + #ifdef Py_GIL_DISABLED + PyObject *old_value = _Py_atomic_exchange_ptr( + (void **)addr, PyStackRef_AsPyObjectSteal(value)); + #else PyObject *old_value = *(PyObject **)addr; - FT_ATOMIC_STORE_PTR_RELEASE(*(PyObject **)addr, PyStackRef_AsPyObjectSteal(value)); - UNLOCK_OBJECT(owner_o); + *(PyObject **)addr = PyStackRef_AsPyObjectSteal(value); + #endif o = owner; stack_pointer[-2] = o; stack_pointer += -1; diff --git a/Python/structmember.c b/Python/structmember.c index adea8216b8796b5..5ca7d7e0a607448 100644 --- a/Python/structmember.c +++ b/Python/structmember.c @@ -97,36 +97,33 @@ PyMember_GetOne(const char *obj_addr, PyMemberDef *l) break; } case _Py_T_OBJECT: - v = FT_ATOMIC_LOAD_PTR(*(PyObject **) addr); - if (v != NULL) { #ifdef Py_GIL_DISABLED - if (!_Py_TryIncrefCompare((PyObject **) addr, v)) { - Py_BEGIN_CRITICAL_SECTION((PyObject *) obj_addr); - v = FT_ATOMIC_LOAD_PTR(*(PyObject **) addr); - Py_XINCREF(v); - Py_END_CRITICAL_SECTION(); - } + // _Py_XGetRef does an atomic load + try-incref loop, retrying when + // a concurrent writer (lock-free atomic exchange in _STORE_ATTR_SLOT + // and PyMember_SetOne) mutates the slot. The previous critical- + // section fallback assumed writers also took ob_mutex, which they no + // longer do, so a plain Py_XINCREF inside the CS could resurrect a + // refcount-0 (about-to-be-freed) object. + v = _Py_XGetRef((PyObject **)addr); #else - Py_INCREF(v); + v = *(PyObject **)addr; + Py_XINCREF(v); #endif - } if (v == NULL) { - v = Py_None; + v = Py_None; // immortal, no incref needed } break; case Py_T_OBJECT_EX: +#ifdef Py_GIL_DISABLED + v = _Py_XGetRef((PyObject **)addr); + if (v == NULL) { + PyErr_Format(PyExc_AttributeError, + "'%T' object has no attribute '%s'", + (PyObject *)obj_addr, l->name); + } +#else v = member_get_object(addr, obj_addr, l); -#ifndef Py_GIL_DISABLED Py_XINCREF(v); -#else - if (v != NULL) { - if (!_Py_TryIncrefCompare((PyObject **) addr, v)) { - Py_BEGIN_CRITICAL_SECTION((PyObject *) obj_addr); - v = member_get_object(addr, obj_addr, l); - Py_XINCREF(v); - Py_END_CRITICAL_SECTION(); - } - } #endif break; case Py_T_LONGLONG: @@ -320,11 +317,15 @@ PyMember_SetOne(char *addr, PyMemberDef *l, PyObject *v) break; } case _Py_T_OBJECT: - case Py_T_OBJECT_EX: - Py_BEGIN_CRITICAL_SECTION(obj); + case Py_T_OBJECT_EX: { + // Lock-free atomic exchange; matches _STORE_ATTR_SLOT. + PyObject *new_v = Py_XNewRef(v); +#ifdef Py_GIL_DISABLED + oldv = (PyObject *)_Py_atomic_exchange_ptr((void **)addr, new_v); +#else oldv = *(PyObject **)addr; - FT_ATOMIC_STORE_PTR_RELEASE(*(PyObject **)addr, Py_XNewRef(v)); - Py_END_CRITICAL_SECTION(); + *(PyObject **)addr = new_v; +#endif if (v == NULL && oldv == NULL && l->type == Py_T_OBJECT_EX) { // Raise an exception when attempting to delete an already deleted // attribute. @@ -336,6 +337,7 @@ PyMember_SetOne(char *addr, PyMemberDef *l, PyObject *v) } Py_XDECREF(oldv); break; + } case Py_T_CHAR: { const char *string; Py_ssize_t len; diff --git a/Tools/cases_generator/analyzer.py b/Tools/cases_generator/analyzer.py index 42459eedad6b1d7..778f401b4f581cf 100644 --- a/Tools/cases_generator/analyzer.py +++ b/Tools/cases_generator/analyzer.py @@ -700,6 +700,7 @@ def has_error_without_pop(op: parser.CodeDef) -> bool: "_Py_TryIncrefCompare", "_Py_TryIncrefCompareStackRef", "_Py_atomic_compare_exchange_uint8", + "_Py_atomic_exchange_ptr", "_Py_atomic_load_ptr_acquire", "_Py_atomic_load_uintptr_relaxed", "_Py_set_eval_breaker_bit",