diff --git a/Include/internal/pycore_dict.h b/Include/internal/pycore_dict.h
index a7005a3b8e2fab..950547cb002f4c 100644
--- a/Include/internal/pycore_dict.h
+++ b/Include/internal/pycore_dict.h
@@ -114,6 +114,7 @@ extern Py_ssize_t _Py_dict_lookup_threadsafe_stackref(PyDictObject *mp, PyObject
 extern int _PyDict_GetMethodStackRef(PyDictObject *dict, PyObject *name,
                                      _PyStackRef *method);
 
+extern Py_ssize_t _PyDict_LookupIndexAndValue(PyDictObject *, PyObject *, PyObject **);
 extern Py_ssize_t _PyDict_LookupIndex(PyDictObject *, PyObject *);
 extern Py_ssize_t _PyDictKeys_StringLookup(PyDictKeysObject* dictkeys, PyObject *key);
diff --git a/Misc/NEWS.d/next/Core_and_Builtins/2025-12-24-13-19-16.gh-issue-132657._P4DDb.rst b/Misc/NEWS.d/next/Core_and_Builtins/2025-12-24-13-19-16.gh-issue-132657._P4DDb.rst
new file mode 100644
index 00000000000000..bbc9611b748fde
--- /dev/null
+++ b/Misc/NEWS.d/next/Core_and_Builtins/2025-12-24-13-19-16.gh-issue-132657._P4DDb.rst
@@ -0,0 +1,6 @@
+If we are specializing to ``LOAD_GLOBAL_MODULE`` or ``LOAD_ATTR_MODULE``, try
+to enable deferred reference counting for the value, if the object is owned by
+a different thread. This applies to the free-threaded build only and should
+improve scaling of multi-threaded programs. Note that when deferred reference
+counting is enabled, the object will be deallocated by the GC, rather than by
+:c:func:`Py_DECREF`.
diff --git a/Objects/dictobject.c b/Objects/dictobject.c
index 5a2bb7d3d8cd2d..b1fa9c48138b1a 100644
--- a/Objects/dictobject.c
+++ b/Objects/dictobject.c
@@ -2346,10 +2346,9 @@ dict_unhashable_type(PyObject *key)
 }
 
 Py_ssize_t
-_PyDict_LookupIndex(PyDictObject *mp, PyObject *key)
+_PyDict_LookupIndexAndValue(PyDictObject *mp, PyObject *key, PyObject **value)
 {
     // TODO: Thread safety
-    PyObject *value;
     assert(PyDict_CheckExact((PyObject*)mp));
     assert(PyUnicode_CheckExact(key));
 
@@ -2359,7 +2358,14 @@
         return -1;
     }
 
-    return _Py_dict_lookup(mp, key, hash, &value);
+    return _Py_dict_lookup(mp, key, hash, value);
+}
+
+Py_ssize_t
+_PyDict_LookupIndex(PyDictObject *mp, PyObject *key)
+{
+    PyObject *value;  // discarded
+    return _PyDict_LookupIndexAndValue(mp, key, &value);
 }
 
 /* Same as PyDict_GetItemWithError() but with hash supplied by caller.
diff --git a/Python/specialize.c b/Python/specialize.c
index 19433bc7a74319..f647bf6e26c117 100644
--- a/Python/specialize.c
+++ b/Python/specialize.c
@@ -355,6 +355,21 @@ static int function_kind(PyCodeObject *code);
 static bool function_check_args(PyObject *o, int expected_argcount, int opcode);
 static uint32_t function_get_version(PyObject *o, int opcode);
 
+#ifdef Py_GIL_DISABLED
+static void
+maybe_enable_deferred_ref_count(PyObject *op)
+{
+    if (!_Py_IsOwnedByCurrentThread(op)) {
+        // For module level variables that are heavily used from multiple
+        // threads, deferred reference counting provides good scaling
+        // benefits. The downside is that the object will only be deallocated
+        // by a GC run.
+        PyUnstable_Object_EnableDeferredRefcount(op);
+    }
+}
+#endif
+
+
 static int
 specialize_module_load_attr_lock_held(PyDictObject *dict, _Py_CODEUNIT *instr, PyObject *name)
 {
@@ -369,7 +384,8 @@ specialize_module_load_attr_lock_held(PyDictObject *dict, _Py_CODEUNIT *instr, P
         SPECIALIZATION_FAIL(LOAD_ATTR, SPEC_FAIL_ATTR_MODULE_ATTR_NOT_FOUND);
         return -1;
     }
-    index = _PyDict_LookupIndex(dict, name);
+    PyObject *value;
+    index = _PyDict_LookupIndexAndValue(dict, name, &value);
     assert (index != DKIX_ERROR);
     if (index != (uint16_t)index) {
         SPECIALIZATION_FAIL(LOAD_ATTR,
@@ -384,6 +400,9 @@ specialize_module_load_attr_lock_held(PyDictObject *dict, _Py_CODEUNIT *instr, P
         SPECIALIZATION_FAIL(LOAD_ATTR, SPEC_FAIL_OUT_OF_VERSIONS);
         return -1;
     }
+#ifdef Py_GIL_DISABLED
+    maybe_enable_deferred_ref_count(value);
+#endif
     write_u32(cache->version, keys_version);
     cache->index = (uint16_t)index;
     specialize(instr, LOAD_ATTR_MODULE);
@@ -1264,7 +1283,6 @@ specialize_attr_loadclassattr(PyObject *owner, _Py_CODEUNIT *instr,
     return 1;
 }
 
-
 static void
 specialize_load_global_lock_held(
     PyObject *globals, PyObject *builtins,
@@ -1284,7 +1302,12 @@ specialize_load_global_lock_held(
         SPECIALIZATION_FAIL(LOAD_GLOBAL, SPEC_FAIL_LOAD_GLOBAL_NON_STRING_OR_SPLIT);
         goto fail;
     }
+#ifdef Py_GIL_DISABLED
+    PyObject *value;
+    Py_ssize_t index = _PyDict_LookupIndexAndValue((PyDictObject *)globals, name, &value);
+#else
     Py_ssize_t index = _PyDictKeys_StringLookup(globals_keys, name);
+#endif
     if (index == DKIX_ERROR) {
         SPECIALIZATION_FAIL(LOAD_GLOBAL, SPEC_FAIL_EXPECTED_ERROR);
         goto fail;
@@ -1305,6 +1328,9 @@ specialize_load_global_lock_held(
         SPECIALIZATION_FAIL(LOAD_GLOBAL, SPEC_FAIL_OUT_OF_RANGE);
         goto fail;
     }
+#ifdef Py_GIL_DISABLED
+    maybe_enable_deferred_ref_count(value);
+#endif
     cache->index = (uint16_t)index;
     cache->module_keys_version = (uint16_t)keys_version;
     specialize(instr, LOAD_GLOBAL_MODULE);
diff --git a/Tools/ftscalingbench/ftscalingbench.py b/Tools/ftscalingbench/ftscalingbench.py
index 097a065f368f30..c2bd7c3880bc90 100644
--- a/Tools/ftscalingbench/ftscalingbench.py
+++ b/Tools/ftscalingbench/ftscalingbench.py
@@ -21,6 +21,7 @@
 # > echo "0" | sudo tee /sys/devices/system/cpu/cpufreq/boost
 #
 
+import copy
 import math
 import os
 import queue
@@ -214,6 +215,14 @@ def instantiate_dataclass():
     for _ in range(1000 * WORK_SCALE):
         obj = MyDataClass(x=1, y=2, z=3)
 
+
+@register_benchmark
+def deepcopy():
+    x = {'list': [1, 2], 'tuple': (1, None)}
+    for i in range(40 * WORK_SCALE):
+        copy.deepcopy(x)
+
+
 def bench_one_thread(func):
     t0 = time.perf_counter_ns()
     func()
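
For context only (not part of the patch): below is a minimal sketch of the access pattern this change targets. The module-level CONFIG dict and the read_config/run helpers are hypothetical names invented for illustration. On the free-threaded build, the repeated LOAD_GLOBAL of CONFIG specializes to LOAD_GLOBAL_MODULE; with this change the specializer may also enable deferred reference counting on the dict, so concurrent readers no longer fight over its reference count field.

    # Hypothetical example (not from the patch): many threads repeatedly read a
    # module-level global. Each read is a LOAD_GLOBAL that can specialize to
    # LOAD_GLOBAL_MODULE; without deferred reference counting, every read also
    # increments and decrements CONFIG's reference count from multiple threads.
    import threading

    CONFIG = {"retries": 3, "timeout": 1.5}   # shared, read-mostly module global

    def read_config(n):
        total = 0
        for _ in range(n):
            total += CONFIG["retries"]        # LOAD_GLOBAL of CONFIG each iteration
        return total

    def run(num_threads=8, iterations=100_000):
        threads = [threading.Thread(target=read_config, args=(iterations,))
                   for _ in range(num_threads)]
        for t in threads:
            t.start()
        for t in threads:
            t.join()

    if __name__ == "__main__":
        run()

The Tools/ftscalingbench harness touched above is the natural place to measure the scaling effect of this kind of change; the patch adds a deepcopy benchmark there for that purpose.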