Skip to content

Commit b2a97bb

Browse files
committed
Reduce memory arena contention
Previously there was one memory arena for all threads, making it the bottleneck for multi-threaded performance. As the number of threads increased, the contention for the lock on the arena would grow, causing other threads to wait to acquire it. This commit makes it use 8 memory arenas, and round-robins how they are assigned to threads. Threads keep track of the index that they should use into the arena array, assigned the first time the arena is accessed on a given thread. When an image is first created, it is allocated from an arena. When the logic to have multiple arenas is enabled, it then keeps track of the index on the image, so that when deleted it can be returned to the correct arena. Effectively this means that in single-threaded programs, this should not really have an effect. We also do not do this logic if the GIL is enabled, as it effectively acts as the lock on the default arena for us. As expected, this approach has no real noticeable effect on regular CPython. On free-threaded CPython, however, there is a massive difference (measuring up to about 70%).
1 parent 51df142 commit b2a97bb

File tree

4 files changed

+284
-68
lines changed

4 files changed

+284
-68
lines changed

setup.py

Lines changed: 57 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -8,18 +8,21 @@
88
# ------------------------------
99
from __future__ import annotations
1010

11+
import distutils.ccompiler
1112
import os
1213
import re
1314
import shutil
1415
import struct
1516
import subprocess
1617
import sys
18+
import tempfile
1719
import warnings
1820
from collections.abc import Iterator
1921
from typing import Any
2022

2123
from setuptools import Extension, setup
2224
from setuptools.command.build_ext import build_ext
25+
from setuptools.errors import CompileError
2326

2427

2528
def get_version() -> str:
@@ -292,6 +295,47 @@ def _pkg_config(name: str) -> tuple[list[str], list[str]] | None:
292295
return None
293296

294297

298+
def _try_compile(compiler: distutils.ccompiler.CCompiler, code: str) -> bool:
299+
try:
300+
with tempfile.TemporaryDirectory() as d:
301+
fn = os.path.join(d, "test.c")
302+
with open(fn, "w") as f:
303+
f.write(code)
304+
compiler.compile([fn], output_dir=d, extra_preargs=["-Werror"])
305+
return True
306+
except CompileError:
307+
return False
308+
309+
310+
def _try_compile_attr(compiler: distutils.ccompiler.CCompiler, attr: str) -> bool:
    """Probe whether ``attr`` is accepted as a specifier on a C global.

    Generates a small C program that declares ``int {attr} foo;`` and passes
    it to ``_try_compile``.

    :param compiler: Compiler object forwarded to ``_try_compile``.
    :param attr: Text spliced between ``int`` and the variable name.
    :returns: Whatever ``_try_compile`` reports for the generated program.
    """
    # The pragmas promote -Wattributes diagnostics to errors for GCC and
    # clang -- presumably so an ignored specifier fails the probe rather than
    # merely warning; confirm against the targeted compilers.
    # Doubled braces are literal braces inside the f-string.
    code = f"""
    #pragma GCC diagnostic error "-Wattributes"
    #pragma clang diagnostic error "-Wattributes"

    int {attr} foo;
    int main() {{
        return 0;
    }}
    """

    return _try_compile(compiler, code)
322+
323+
324+
def _try_compile_tls_define_macros(
    compiler: distutils.ccompiler.CCompiler,
) -> list[tuple[str, str | None]]:
    """Pick the preprocessor macro for the first usable thread-local specifier.

    Each candidate spelling is probed with ``_try_compile_attr`` in the order
    listed below; probing stops at the first one that compiles.

    :param compiler: Compiler object forwarded to ``_try_compile_attr``.
    :returns: A one-element ``define_macros`` list naming the accepted
        spelling, or an empty list when none of the candidates compiles.
    """
    # (specifier, macro) pairs, tried top to bottom.
    candidates = (
        ("thread_local", "HAVE_THREAD_LOCAL"),  # C23
        ("_Thread_local", "HAVE__THREAD_LOCAL"),  # C11/C17
        ("__thread", "HAVE___THREAD"),  # GCC/clang
        ("__declspec(thread)", "HAVE___DECLSPEC_THREAD_"),  # MSVC
    )
    for specifier, macro_name in candidates:
        if _try_compile_attr(compiler, specifier):
            return [(macro_name, None)]
    return []
337+
338+
295339
class pil_build_ext(build_ext):
296340
class ext_feature:
297341
features = [
@@ -426,13 +470,14 @@ def finalize_options(self) -> None:
426470
def _update_extension(
427471
self,
428472
name: str,
429-
libraries: list[str] | list[str | bool | None],
473+
libraries: list[str] | list[str | bool | None] | None = None,
430474
define_macros: list[tuple[str, str | None]] | None = None,
431475
sources: list[str] | None = None,
432476
) -> None:
433477
for extension in self.extensions:
434478
if extension.name == name:
435-
extension.libraries += libraries
479+
if libraries is not None:
480+
extension.libraries += libraries
436481
if define_macros is not None:
437482
extension.define_macros += define_macros
438483
if sources is not None:
@@ -890,7 +935,10 @@ def build_extensions(self) -> None:
890935

891936
defs.append(("PILLOW_VERSION", f'"{PILLOW_VERSION}"'))
892937

893-
self._update_extension("PIL._imaging", libs, defs)
938+
tls_define_macros = _try_compile_tls_define_macros(self.compiler)
939+
self._update_extension("PIL._imaging", libs, defs + tls_define_macros)
940+
self._update_extension("PIL._imagingmath", define_macros=tls_define_macros)
941+
self._update_extension("PIL._imagingmorph", define_macros=tls_define_macros)
894942

895943
#
896944
# additional libraries
@@ -913,7 +961,9 @@ def build_extensions(self) -> None:
913961
libs.append(feature.get("fribidi"))
914962
else: # building FriBiDi shim from src/thirdparty
915963
srcs.append("src/thirdparty/fribidi-shim/fribidi.c")
916-
self._update_extension("PIL._imagingft", libs, defs, srcs)
964+
self._update_extension(
965+
"PIL._imagingft", libs, defs + tls_define_macros, srcs
966+
)
917967

918968
else:
919969
self._remove_extension("PIL._imagingft")
@@ -922,19 +972,19 @@ def build_extensions(self) -> None:
922972
libs = [feature.get("lcms")]
923973
if sys.platform == "win32":
924974
libs.extend(["user32", "gdi32"])
925-
self._update_extension("PIL._imagingcms", libs)
975+
self._update_extension("PIL._imagingcms", libs, tls_define_macros)
926976
else:
927977
self._remove_extension("PIL._imagingcms")
928978

929979
webp = feature.get("webp")
930980
if isinstance(webp, str):
931981
libs = [webp, webp + "mux", webp + "demux"]
932-
self._update_extension("PIL._webp", libs)
982+
self._update_extension("PIL._webp", libs, tls_define_macros)
933983
else:
934984
self._remove_extension("PIL._webp")
935985

936986
tk_libs = ["psapi"] if sys.platform in ("win32", "cygwin") else []
937-
self._update_extension("PIL._imagingtk", tk_libs)
987+
self._update_extension("PIL._imagingtk", tk_libs, tls_define_macros)
938988

939989
build_ext.build_extensions(self)
940990

src/_imaging.c

Lines changed: 75 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -3938,34 +3938,49 @@ _get_stats(PyObject *self, PyObject *args) {
39383938
return NULL;
39393939
}
39403940

3941-
MUTEX_LOCK(&ImagingDefaultArena.mutex);
3942-
ImagingMemoryArena arena = &ImagingDefaultArena;
3943-
3944-
v = PyLong_FromLong(arena->stats_new_count);
3941+
long stats_new_count = 0;
3942+
long stats_allocated_blocks = 0;
3943+
long stats_reused_blocks = 0;
3944+
long stats_reallocated_blocks = 0;
3945+
long stats_freed_blocks = 0;
3946+
long blocks_cached = 0;
3947+
3948+
ImagingMemoryArena arena;
3949+
IMAGING_ARENAS_FOREACH(arena) {
3950+
MUTEX_LOCK(&arena->mutex);
3951+
stats_new_count += arena->stats_new_count;
3952+
stats_allocated_blocks += arena->stats_allocated_blocks;
3953+
stats_reused_blocks += arena->stats_reused_blocks;
3954+
stats_reallocated_blocks += arena->stats_reallocated_blocks;
3955+
stats_freed_blocks += arena->stats_freed_blocks;
3956+
blocks_cached += arena->blocks_cached;
3957+
MUTEX_UNLOCK(&arena->mutex);
3958+
}
3959+
3960+
v = PyLong_FromLong(stats_new_count);
39453961
PyDict_SetItemString(d, "new_count", v ? v : Py_None);
39463962
Py_XDECREF(v);
39473963

3948-
v = PyLong_FromLong(arena->stats_allocated_blocks);
3964+
v = PyLong_FromLong(stats_allocated_blocks);
39493965
PyDict_SetItemString(d, "allocated_blocks", v ? v : Py_None);
39503966
Py_XDECREF(v);
39513967

3952-
v = PyLong_FromLong(arena->stats_reused_blocks);
3968+
v = PyLong_FromLong(stats_reused_blocks);
39533969
PyDict_SetItemString(d, "reused_blocks", v ? v : Py_None);
39543970
Py_XDECREF(v);
39553971

3956-
v = PyLong_FromLong(arena->stats_reallocated_blocks);
3972+
v = PyLong_FromLong(stats_reallocated_blocks);
39573973
PyDict_SetItemString(d, "reallocated_blocks", v ? v : Py_None);
39583974
Py_XDECREF(v);
39593975

3960-
v = PyLong_FromLong(arena->stats_freed_blocks);
3976+
v = PyLong_FromLong(stats_freed_blocks);
39613977
PyDict_SetItemString(d, "freed_blocks", v ? v : Py_None);
39623978
Py_XDECREF(v);
39633979

3964-
v = PyLong_FromLong(arena->blocks_cached);
3980+
v = PyLong_FromLong(blocks_cached);
39653981
PyDict_SetItemString(d, "blocks_cached", v ? v : Py_None);
39663982
Py_XDECREF(v);
39673983

3968-
MUTEX_UNLOCK(&ImagingDefaultArena.mutex);
39693984
return d;
39703985
}
39713986

@@ -3975,14 +3990,16 @@ _reset_stats(PyObject *self, PyObject *args) {
39753990
return NULL;
39763991
}
39773992

3978-
MUTEX_LOCK(&ImagingDefaultArena.mutex);
3979-
ImagingMemoryArena arena = &ImagingDefaultArena;
3980-
arena->stats_new_count = 0;
3981-
arena->stats_allocated_blocks = 0;
3982-
arena->stats_reused_blocks = 0;
3983-
arena->stats_reallocated_blocks = 0;
3984-
arena->stats_freed_blocks = 0;
3985-
MUTEX_UNLOCK(&ImagingDefaultArena.mutex);
3993+
ImagingMemoryArena arena;
3994+
IMAGING_ARENAS_FOREACH(arena) {
3995+
MUTEX_LOCK(&arena->mutex);
3996+
arena->stats_new_count = 0;
3997+
arena->stats_allocated_blocks = 0;
3998+
arena->stats_reused_blocks = 0;
3999+
arena->stats_reallocated_blocks = 0;
4000+
arena->stats_freed_blocks = 0;
4001+
MUTEX_UNLOCK(&arena->mutex);
4002+
}
39864003

39874004
Py_INCREF(Py_None);
39884005
return Py_None;
@@ -3994,9 +4011,10 @@ _get_alignment(PyObject *self, PyObject *args) {
39944011
return NULL;
39954012
}
39964013

3997-
MUTEX_LOCK(&ImagingDefaultArena.mutex);
3998-
int alignment = ImagingDefaultArena.alignment;
3999-
MUTEX_UNLOCK(&ImagingDefaultArena.mutex);
4014+
ImagingMemoryArena arena = ImagingGetArena();
4015+
MUTEX_LOCK(&arena->mutex);
4016+
int alignment = arena->alignment;
4017+
MUTEX_UNLOCK(&arena->mutex);
40004018
return PyLong_FromLong(alignment);
40014019
}
40024020

@@ -4006,9 +4024,10 @@ _get_block_size(PyObject *self, PyObject *args) {
40064024
return NULL;
40074025
}
40084026

4009-
MUTEX_LOCK(&ImagingDefaultArena.mutex);
4010-
int block_size = ImagingDefaultArena.block_size;
4011-
MUTEX_UNLOCK(&ImagingDefaultArena.mutex);
4027+
ImagingMemoryArena arena = ImagingGetArena();
4028+
MUTEX_LOCK(&arena->mutex);
4029+
int block_size = arena->block_size;
4030+
MUTEX_UNLOCK(&arena->mutex);
40124031
return PyLong_FromLong(block_size);
40134032
}
40144033

@@ -4018,9 +4037,10 @@ _get_blocks_max(PyObject *self, PyObject *args) {
40184037
return NULL;
40194038
}
40204039

4021-
MUTEX_LOCK(&ImagingDefaultArena.mutex);
4022-
int blocks_max = ImagingDefaultArena.blocks_max;
4023-
MUTEX_UNLOCK(&ImagingDefaultArena.mutex);
4040+
ImagingMemoryArena arena = ImagingGetArena();
4041+
MUTEX_LOCK(&arena->mutex);
4042+
int blocks_max = arena->blocks_max;
4043+
MUTEX_UNLOCK(&arena->mutex);
40244044
return PyLong_FromLong(blocks_max);
40254045
}
40264046

@@ -4041,9 +4061,12 @@ _set_alignment(PyObject *self, PyObject *args) {
40414061
return NULL;
40424062
}
40434063

4044-
MUTEX_LOCK(&ImagingDefaultArena.mutex);
4045-
ImagingDefaultArena.alignment = alignment;
4046-
MUTEX_UNLOCK(&ImagingDefaultArena.mutex);
4064+
ImagingMemoryArena arena;
4065+
IMAGING_ARENAS_FOREACH(arena) {
4066+
MUTEX_LOCK(&arena->mutex);
4067+
arena->alignment = alignment;
4068+
MUTEX_UNLOCK(&arena->mutex);
4069+
}
40474070

40484071
Py_INCREF(Py_None);
40494072
return Py_None;
@@ -4066,9 +4089,12 @@ _set_block_size(PyObject *self, PyObject *args) {
40664089
return NULL;
40674090
}
40684091

4069-
MUTEX_LOCK(&ImagingDefaultArena.mutex);
4070-
ImagingDefaultArena.block_size = block_size;
4071-
MUTEX_UNLOCK(&ImagingDefaultArena.mutex);
4092+
ImagingMemoryArena arena;
4093+
IMAGING_ARENAS_FOREACH(arena) {
4094+
MUTEX_LOCK(&arena->mutex);
4095+
arena->block_size = block_size;
4096+
MUTEX_UNLOCK(&arena->mutex);
4097+
}
40724098

40734099
Py_INCREF(Py_None);
40744100
return Py_None;
@@ -4087,15 +4113,20 @@ _set_blocks_max(PyObject *self, PyObject *args) {
40874113
}
40884114

40894115
if ((unsigned long)blocks_max >
4090-
SIZE_MAX / sizeof(ImagingDefaultArena.blocks_pool[0])) {
4116+
SIZE_MAX / sizeof(ImagingGetArena()->blocks_pool[0])) {
40914117
PyErr_SetString(PyExc_ValueError, "blocks_max is too large");
40924118
return NULL;
40934119
}
40944120

4095-
MUTEX_LOCK(&ImagingDefaultArena.mutex);
4096-
int status = ImagingMemorySetBlocksMax(&ImagingDefaultArena, blocks_max);
4097-
MUTEX_UNLOCK(&ImagingDefaultArena.mutex);
4098-
if (!status) {
4121+
int error = 0;
4122+
ImagingMemoryArena arena;
4123+
IMAGING_ARENAS_FOREACH(arena) {
4124+
MUTEX_LOCK(&arena->mutex);
4125+
error |= ImagingMemorySetBlocksMax(arena, blocks_max);
4126+
MUTEX_UNLOCK(&arena->mutex);
4127+
}
4128+
4129+
if (error) {
40994130
return ImagingError_MemoryError();
41004131
}
41014132

@@ -4111,9 +4142,12 @@ _clear_cache(PyObject *self, PyObject *args) {
41114142
return NULL;
41124143
}
41134144

4114-
MUTEX_LOCK(&ImagingDefaultArena.mutex);
4115-
ImagingMemoryClearCache(&ImagingDefaultArena, i);
4116-
MUTEX_UNLOCK(&ImagingDefaultArena.mutex);
4145+
ImagingMemoryArena arena;
4146+
IMAGING_ARENAS_FOREACH(arena) {
4147+
MUTEX_LOCK(&arena->mutex);
4148+
ImagingMemoryClearCache(arena, i);
4149+
MUTEX_UNLOCK(&arena->mutex);
4150+
}
41174151

41184152
Py_INCREF(Py_None);
41194153
return Py_None;

0 commit comments

Comments
 (0)