Skip to content

Commit f8290df

Browse files
authored
gh-116738: Make _codecs module thread-safe (#117530)
The module itself is a thin wrapper around calls to functions in `Python/codecs.c`, so that's where the meaningful changes happened: - Move codecs-related state that lives on `PyInterpreterState` to a struct declared in `pycore_codecs.h`. - In free-threaded builds, add a mutex to `codecs_state` to synchronize operations on `search_path`. Because `search_path_mutex` is used as a normal mutex and not a critical section, we must be extremely careful with operations called while holding it. - The codec registry is explicitly initialized as part of `_PyUnicode_InitEncodings` to simplify thread-safety.
1 parent 4e2caf2 commit f8290df

File tree

6 files changed

+120
-79
lines changed

6 files changed

+120
-79
lines changed

Include/internal/pycore_codecs.h

+31
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,17 @@ extern "C" {
88
# error "this header requires Py_BUILD_CORE define"
99
#endif
1010

11+
#include "pycore_lock.h" // PyMutex
12+
13+
/* Initialize codecs-related state for the given interpreter, including
14+
registering the first codec search function. Must be called before any other
15+
PyCodec-related functions, and while only one thread is active. */
16+
extern PyStatus _PyCodec_InitRegistry(PyInterpreterState *interp);
17+
18+
/* Finalize codecs-related state for the given interpreter. No PyCodec-related
19+
functions other than PyCodec_Unregister() may be called after this. */
20+
extern void _PyCodec_Fini(PyInterpreterState *interp);
21+
1122
extern PyObject* _PyCodec_Lookup(const char *encoding);
1223

1324
/* Text codec specific encoding and decoding API.
@@ -48,6 +59,26 @@ extern PyObject* _PyCodecInfo_GetIncrementalEncoder(
4859
PyObject *codec_info,
4960
const char *errors);
5061

62+
// Per-interpreter state used by codecs.c.
63+
struct codecs_state {
64+
// A list of callable objects used to search for codecs.
65+
PyObject *search_path;
66+
67+
// A dict mapping codec names to codecs returned from a callable in
68+
// search_path.
69+
PyObject *search_cache;
70+
71+
// A dict mapping error handling strategies to functions to implement them.
72+
PyObject *error_registry;
73+
74+
#ifdef Py_GIL_DISABLED
75+
// Used to safely delete a specific item from search_path.
76+
PyMutex search_path_mutex;
77+
#endif
78+
79+
// Whether or not the rest of the state is initialized.
80+
int initialized;
81+
};
5182

5283
#ifdef __cplusplus
5384
}

Include/internal/pycore_interp.h

+2-4
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ extern "C" {
1414
#include "pycore_atexit.h" // struct atexit_state
1515
#include "pycore_ceval_state.h" // struct _ceval_state
1616
#include "pycore_code.h" // struct callable_cache
17+
#include "pycore_codecs.h" // struct codecs_state
1718
#include "pycore_context.h" // struct _Py_context_state
1819
#include "pycore_crossinterp.h" // struct _xidregistry
1920
#include "pycore_dict_state.h" // struct _Py_dict_state
@@ -182,10 +183,7 @@ struct _is {
182183
possible to facilitate out-of-process observability
183184
tools. */
184185

185-
PyObject *codec_search_path;
186-
PyObject *codec_search_cache;
187-
PyObject *codec_error_registry;
188-
int codecs_initialized;
186+
struct codecs_state codecs;
189187

190188
PyConfig config;
191189
unsigned long feature_flags;

Objects/unicodeobject.c

+5-1
Original file line numberDiff line numberDiff line change
@@ -15441,7 +15441,11 @@ init_fs_encoding(PyThreadState *tstate)
1544115441
PyStatus
1544215442
_PyUnicode_InitEncodings(PyThreadState *tstate)
1544315443
{
15444-
PyStatus status = init_fs_encoding(tstate);
15444+
PyStatus status = _PyCodec_InitRegistry(tstate->interp);
15445+
if (_PyStatus_EXCEPTION(status)) {
15446+
return status;
15447+
}
15448+
status = init_fs_encoding(tstate);
1544515449
if (_PyStatus_EXCEPTION(status)) {
1544615450
return status;
1544715451
}

Python/codecs.c

+80-70
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ Copyright (c) Corporation for National Research Initiatives.
1111
#include "Python.h"
1212
#include "pycore_call.h" // _PyObject_CallNoArgs()
1313
#include "pycore_interp.h" // PyInterpreterState.codec_search_path
14+
#include "pycore_lock.h" // PyMutex
1415
#include "pycore_pyerrors.h" // _PyErr_FormatNote()
1516
#include "pycore_pystate.h" // _PyInterpreterState_GET()
1617
#include "pycore_ucnhash.h" // _PyUnicode_Name_CAPI
@@ -19,24 +20,10 @@ const char *Py_hexdigits = "0123456789abcdef";
1920

2021
/* --- Codec Registry ----------------------------------------------------- */
2122

22-
/* Import the standard encodings package which will register the first
23-
codec search function.
24-
25-
This is done in a lazy way so that the Unicode implementation does
26-
not downgrade startup time of scripts not needing it.
27-
28-
ImportErrors are silently ignored by this function. Only one try is
29-
made.
30-
31-
*/
32-
33-
static int _PyCodecRegistry_Init(void); /* Forward */
34-
3523
int PyCodec_Register(PyObject *search_function)
3624
{
3725
PyInterpreterState *interp = _PyInterpreterState_GET();
38-
if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
39-
goto onError;
26+
assert(interp->codecs.initialized);
4027
if (search_function == NULL) {
4128
PyErr_BadArgument();
4229
goto onError;
@@ -45,7 +32,14 @@ int PyCodec_Register(PyObject *search_function)
4532
PyErr_SetString(PyExc_TypeError, "argument must be callable");
4633
goto onError;
4734
}
48-
return PyList_Append(interp->codec_search_path, search_function);
35+
#ifdef Py_GIL_DISABLED
36+
PyMutex_Lock(&interp->codecs.search_path_mutex);
37+
#endif
38+
int ret = PyList_Append(interp->codecs.search_path, search_function);
39+
#ifdef Py_GIL_DISABLED
40+
PyMutex_Unlock(&interp->codecs.search_path_mutex);
41+
#endif
42+
return ret;
4943

5044
onError:
5145
return -1;
@@ -55,22 +49,34 @@ int
5549
PyCodec_Unregister(PyObject *search_function)
5650
{
5751
PyInterpreterState *interp = _PyInterpreterState_GET();
58-
PyObject *codec_search_path = interp->codec_search_path;
59-
/* Do nothing if codec_search_path is not created yet or was cleared. */
60-
if (codec_search_path == NULL) {
52+
if (interp->codecs.initialized != 1) {
53+
/* Do nothing if codecs state was cleared (only possible during
54+
interpreter shutdown). */
6155
return 0;
6256
}
6357

58+
PyObject *codec_search_path = interp->codecs.search_path;
6459
assert(PyList_CheckExact(codec_search_path));
65-
Py_ssize_t n = PyList_GET_SIZE(codec_search_path);
66-
for (Py_ssize_t i = 0; i < n; i++) {
67-
PyObject *item = PyList_GET_ITEM(codec_search_path, i);
60+
for (Py_ssize_t i = 0; i < PyList_GET_SIZE(codec_search_path); i++) {
61+
#ifdef Py_GIL_DISABLED
62+
PyMutex_Lock(&interp->codecs.search_path_mutex);
63+
#endif
64+
PyObject *item = PyList_GetItemRef(codec_search_path, i);
65+
int ret = 1;
6866
if (item == search_function) {
69-
if (interp->codec_search_cache != NULL) {
70-
assert(PyDict_CheckExact(interp->codec_search_cache));
71-
PyDict_Clear(interp->codec_search_cache);
72-
}
73-
return PyList_SetSlice(codec_search_path, i, i+1, NULL);
67+
// We hold a reference to the item, so its destructor can't run
68+
// while we hold search_path_mutex.
69+
ret = PyList_SetSlice(codec_search_path, i, i+1, NULL);
70+
}
71+
#ifdef Py_GIL_DISABLED
72+
PyMutex_Unlock(&interp->codecs.search_path_mutex);
73+
#endif
74+
Py_DECREF(item);
75+
if (ret != 1) {
76+
assert(interp->codecs.search_cache != NULL);
77+
assert(PyDict_CheckExact(interp->codecs.search_cache));
78+
PyDict_Clear(interp->codecs.search_cache);
79+
return ret;
7480
}
7581
}
7682
return 0;
@@ -132,9 +138,7 @@ PyObject *_PyCodec_Lookup(const char *encoding)
132138
}
133139

134140
PyInterpreterState *interp = _PyInterpreterState_GET();
135-
if (interp->codec_search_path == NULL && _PyCodecRegistry_Init()) {
136-
return NULL;
137-
}
141+
assert(interp->codecs.initialized);
138142

139143
/* Convert the encoding to a normalized Python string: all
140144
characters are converted to lower case, spaces and hyphens are
@@ -147,7 +151,7 @@ PyObject *_PyCodec_Lookup(const char *encoding)
147151

148152
/* First, try to lookup the name in the registry dictionary */
149153
PyObject *result;
150-
if (PyDict_GetItemRef(interp->codec_search_cache, v, &result) < 0) {
154+
if (PyDict_GetItemRef(interp->codecs.search_cache, v, &result) < 0) {
151155
goto onError;
152156
}
153157
if (result != NULL) {
@@ -156,7 +160,7 @@ PyObject *_PyCodec_Lookup(const char *encoding)
156160
}
157161

158162
/* Next, scan the search functions in order of registration */
159-
const Py_ssize_t len = PyList_Size(interp->codec_search_path);
163+
const Py_ssize_t len = PyList_Size(interp->codecs.search_path);
160164
if (len < 0)
161165
goto onError;
162166
if (len == 0) {
@@ -170,14 +174,15 @@ PyObject *_PyCodec_Lookup(const char *encoding)
170174
for (i = 0; i < len; i++) {
171175
PyObject *func;
172176

173-
func = PyList_GetItem(interp->codec_search_path, i);
177+
func = PyList_GetItemRef(interp->codecs.search_path, i);
174178
if (func == NULL)
175179
goto onError;
176180
result = PyObject_CallOneArg(func, v);
181+
Py_DECREF(func);
177182
if (result == NULL)
178183
goto onError;
179184
if (result == Py_None) {
180-
Py_DECREF(result);
185+
Py_CLEAR(result);
181186
continue;
182187
}
183188
if (!PyTuple_Check(result) || PyTuple_GET_SIZE(result) != 4) {
@@ -188,15 +193,15 @@ PyObject *_PyCodec_Lookup(const char *encoding)
188193
}
189194
break;
190195
}
191-
if (i == len) {
196+
if (result == NULL) {
192197
/* XXX Perhaps we should cache misses too ? */
193198
PyErr_Format(PyExc_LookupError,
194199
"unknown encoding: %s", encoding);
195200
goto onError;
196201
}
197202

198203
/* Cache and return the result */
199-
if (PyDict_SetItem(interp->codec_search_cache, v, result) < 0) {
204+
if (PyDict_SetItem(interp->codecs.search_cache, v, result) < 0) {
200205
Py_DECREF(result);
201206
goto onError;
202207
}
@@ -600,13 +605,12 @@ PyObject *_PyCodec_DecodeText(PyObject *object,
600605
int PyCodec_RegisterError(const char *name, PyObject *error)
601606
{
602607
PyInterpreterState *interp = _PyInterpreterState_GET();
603-
if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
604-
return -1;
608+
assert(interp->codecs.initialized);
605609
if (!PyCallable_Check(error)) {
606610
PyErr_SetString(PyExc_TypeError, "handler must be callable");
607611
return -1;
608612
}
609-
return PyDict_SetItemString(interp->codec_error_registry,
613+
return PyDict_SetItemString(interp->codecs.error_registry,
610614
name, error);
611615
}
612616

@@ -616,13 +620,12 @@ int PyCodec_RegisterError(const char *name, PyObject *error)
616620
PyObject *PyCodec_LookupError(const char *name)
617621
{
618622
PyInterpreterState *interp = _PyInterpreterState_GET();
619-
if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
620-
return NULL;
623+
assert(interp->codecs.initialized);
621624

622625
if (name==NULL)
623626
name = "strict";
624627
PyObject *handler;
625-
if (PyDict_GetItemStringRef(interp->codec_error_registry, name, &handler) < 0) {
628+
if (PyDict_GetItemStringRef(interp->codecs.error_registry, name, &handler) < 0) {
626629
return NULL;
627630
}
628631
if (handler == NULL) {
@@ -1375,7 +1378,8 @@ static PyObject *surrogateescape_errors(PyObject *self, PyObject *exc)
13751378
return PyCodec_SurrogateEscapeErrors(exc);
13761379
}
13771380

1378-
static int _PyCodecRegistry_Init(void)
1381+
PyStatus
1382+
_PyCodec_InitRegistry(PyInterpreterState *interp)
13791383
{
13801384
static struct {
13811385
const char *name;
@@ -1463,45 +1467,51 @@ static int _PyCodecRegistry_Init(void)
14631467
}
14641468
};
14651469

1466-
PyInterpreterState *interp = _PyInterpreterState_GET();
1467-
PyObject *mod;
1468-
1469-
if (interp->codec_search_path != NULL)
1470-
return 0;
1471-
1472-
interp->codec_search_path = PyList_New(0);
1473-
if (interp->codec_search_path == NULL) {
1474-
return -1;
1470+
assert(interp->codecs.initialized == 0);
1471+
interp->codecs.search_path = PyList_New(0);
1472+
if (interp->codecs.search_path == NULL) {
1473+
return PyStatus_NoMemory();
14751474
}
1476-
1477-
interp->codec_search_cache = PyDict_New();
1478-
if (interp->codec_search_cache == NULL) {
1479-
return -1;
1475+
interp->codecs.search_cache = PyDict_New();
1476+
if (interp->codecs.search_cache == NULL) {
1477+
return PyStatus_NoMemory();
14801478
}
1481-
1482-
interp->codec_error_registry = PyDict_New();
1483-
if (interp->codec_error_registry == NULL) {
1484-
return -1;
1479+
interp->codecs.error_registry = PyDict_New();
1480+
if (interp->codecs.error_registry == NULL) {
1481+
return PyStatus_NoMemory();
14851482
}
1486-
14871483
for (size_t i = 0; i < Py_ARRAY_LENGTH(methods); ++i) {
14881484
PyObject *func = PyCFunction_NewEx(&methods[i].def, NULL, NULL);
1489-
if (!func) {
1490-
return -1;
1485+
if (func == NULL) {
1486+
return PyStatus_NoMemory();
14911487
}
14921488

1493-
int res = PyCodec_RegisterError(methods[i].name, func);
1489+
int res = PyDict_SetItemString(interp->codecs.error_registry,
1490+
methods[i].name, func);
14941491
Py_DECREF(func);
1495-
if (res) {
1496-
return -1;
1492+
if (res < 0) {
1493+
return PyStatus_Error("Failed to insert into codec error registry");
14971494
}
14981495
}
14991496

1500-
mod = PyImport_ImportModule("encodings");
1497+
interp->codecs.initialized = 1;
1498+
1499+
// Importing `encodings' will call back into this module to register codec
1500+
// search functions, so this is done after everything else is initialized.
1501+
PyObject *mod = PyImport_ImportModule("encodings");
15011502
if (mod == NULL) {
1502-
return -1;
1503+
return PyStatus_Error("Failed to import encodings module");
15031504
}
15041505
Py_DECREF(mod);
1505-
interp->codecs_initialized = 1;
1506-
return 0;
1506+
1507+
return PyStatus_Ok();
1508+
}
1509+
1510+
void
1511+
_PyCodec_Fini(PyInterpreterState *interp)
1512+
{
1513+
Py_CLEAR(interp->codecs.search_path);
1514+
Py_CLEAR(interp->codecs.search_cache);
1515+
Py_CLEAR(interp->codecs.error_registry);
1516+
interp->codecs.initialized = 0;
15071517
}

Python/pystate.c

+1-3
Original file line numberDiff line numberDiff line change
@@ -843,9 +843,7 @@ interpreter_clear(PyInterpreterState *interp, PyThreadState *tstate)
843843
}
844844

845845
PyConfig_Clear(&interp->config);
846-
Py_CLEAR(interp->codec_search_path);
847-
Py_CLEAR(interp->codec_search_cache);
848-
Py_CLEAR(interp->codec_error_registry);
846+
_PyCodec_Fini(interp);
849847

850848
assert(interp->imports.modules == NULL);
851849
assert(interp->imports.modules_by_index == NULL);

Tools/c-analyzer/cpython/ignored.tsv

+1-1
Original file line numberDiff line numberDiff line change
@@ -344,7 +344,7 @@ Python/ceval.c - _PyEval_BinaryOps -
344344
Python/ceval.c - _Py_INTERPRETER_TRAMPOLINE_INSTRUCTIONS -
345345
Python/codecs.c - Py_hexdigits -
346346
Python/codecs.c - ucnhash_capi -
347-
Python/codecs.c _PyCodecRegistry_Init methods -
347+
Python/codecs.c _PyCodec_InitRegistry methods -
348348
Python/compile.c - NO_LABEL -
349349
Python/compile.c - NO_LOCATION -
350350
Python/dynload_shlib.c - _PyImport_DynLoadFiletab -

0 commit comments

Comments
 (0)