gh-124785: re-work fix so tracerefs test passes #124808


Closed
1 change: 1 addition & 0 deletions Include/internal/pycore_global_objects.h
@@ -65,6 +65,7 @@ struct _Py_static_objects {
 
 struct _Py_interp_cached_objects {
     PyObject *interned_strings;
+    PyObject *interned_strings_legacy;
 
     /* AST */
     PyObject *str_replace_inf;
28 changes: 20 additions & 8 deletions Lib/test/test_sys.py
@@ -1076,21 +1076,33 @@ def test_getallocatedblocks(self):
         # about the underlying implementation: the function might
         # return 0 or something greater.
         self.assertGreaterEqual(a, 0)
-        try:
-            # While we could imagine a Python session where the number of
-            # multiple buffer objects would exceed the sharing of references,
-            # it is unlikely to happen in a normal test run.
-            self.assertLess(a, sys.gettotalrefcount())
-        except AttributeError:
-            # gettotalrefcount() not available
-            pass
         gc.collect()
         b = sys.getallocatedblocks()
         self.assertLessEqual(b, a)
         gc.collect()
         c = sys.getallocatedblocks()
         self.assertIn(c, range(b - 50, b + 50))
 
+    @unittest.skipUnless(hasattr(sys, "getallocatedblocks"),
+                         "sys.getallocatedblocks unavailable on this build")
+    def test_getallocatedblocks_refcount(self):
+        # While we could imagine a Python session where the number of multiple
+        # buffer objects would exceed the sharing of references, it is unlikely
+        # to happen given that we run this in a subinterpreter.
+        code = """if 1:
+            import sys
+            num_blocks = sys.getallocatedblocks()
+            try:
+                total_refcnt = sys.gettotalrefcount()
+            except AttributeError:
+                pass
+            else:
+                if num_blocks > total_refcnt:
+                    raise AssertionError('allocated blocks exceeds total refcnt')
+            """
+        self.assertEqual(support.run_in_subinterp(code), 0,
+                         'subinterp code failure, check stderr.')
+
     def test_is_gil_enabled(self):
         if support.Py_GIL_DISABLED:
             self.assertIs(type(sys._is_gil_enabled()), bool)
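The new test above delegates to support.run_in_subinterp(code), which returns 0 when the code string ran cleanly in a fresh subinterpreter. For readers unfamiliar with that helper, here is a rough, editor-supplied C sketch of what it does under the hood; the real implementation lives in the _testcapi extension and is more careful about error handling. Py_NewInterpreter() creates a legacy-configured subinterpreter, which is the point of the comment about running in a subinterpreter.

#include <Python.h>

/* Rough equivalent of test.support.run_in_subinterp(code): execute `code`
 * in a new legacy-style subinterpreter and return 0 on success.
 * Editor's sketch, not the actual _testcapi implementation. */
static int
run_in_subinterp(const char *code)
{
    PyThreadState *main_tstate = PyThreadState_Get();
    PyThreadState *substate = Py_NewInterpreter();
    if (substate == NULL) {
        return -1;
    }
    int rc = PyRun_SimpleString(code);
    Py_EndInterpreter(substate);
    PyThreadState_Swap(main_tstate);
    return rc;
}

int main(void)
{
    Py_Initialize();
    int rc = run_in_subinterp("import sys\nprint(sys.getallocatedblocks())\n");
    Py_Finalize();
    return rc == 0 ? 0 : 1;
}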
81 changes: 77 additions & 4 deletions Objects/unicodeobject.c
@@ -291,12 +291,18 @@ hashtable_unicode_compare(const void *key1, const void *key2)
    It's not safe to deallocate those strings until all interpreters that
    potentially use them are freed. By storing them in the main interpreter, we
    ensure they get freed after all other interpreters are freed.
+
+   Subtle detail: sharing the interned string dict is only required if those
+   kinds of legacy modules are actually imported.  However, we can't wait
+   until the import happens, so we share whenever such modules are allowed,
+   i.e. when the Py_RTFLAGS_MULTI_INTERP_EXTENSIONS flag is not set.
 */
 static bool
 has_shared_intern_dict(PyInterpreterState *interp)
 {
     PyInterpreterState *main_interp = _PyInterpreterState_Main();
-    return interp != main_interp && interp->feature_flags & Py_RTFLAGS_USE_MAIN_OBMALLOC;
+    return (interp != main_interp &&
+            !(interp->feature_flags & Py_RTFLAGS_MULTI_INTERP_EXTENSIONS));
Comment on lines +304 to +305

Member: FYI, (not Py_RTFLAGS_MULTI_INTERP_EXTENSIONS) does not imply Py_RTFLAGS_USE_MAIN_OBMALLOC. Why not stick with Py_RTFLAGS_USE_MAIN_OBMALLOC?

Member Author: I'm looking at reload_singlephase_extension and when we can end up in the PyDict_Update case. I think it can only happen if Py_RTFLAGS_MULTI_INTERP_EXTENSIONS is set. If we do that dict update, then we are sharing objects between interpreters.

 }
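Context for the thread above (an editor's sketch, not part of the patch): both feature flags are derived from the embedder's PyInterpreterConfig when a subinterpreter is created, so which combinations occur in practice depends on that config. A minimal embedding example, assuming the public Py_NewInterpreterFromConfig() API from CPython 3.12+; the field values are illustrative, not taken from this PR:

#include <Python.h>

int main(void)
{
    Py_Initialize();
    PyThreadState *main_tstate = PyThreadState_Get();

    /* Illustrative legacy-friendly config: single-phase (legacy) extension
     * modules stay importable because check_multi_interp_extensions == 0,
     * and the subinterpreter shares the main allocator. */
    PyInterpreterConfig config = {
        .use_main_obmalloc = 1,
        .allow_fork = 1,
        .allow_exec = 1,
        .allow_threads = 1,
        .allow_daemon_threads = 1,
        .check_multi_interp_extensions = 0,
        .gil = PyInterpreterConfig_SHARED_GIL,
    };

    PyThreadState *substate = NULL;
    PyStatus status = Py_NewInterpreterFromConfig(&substate, &config);
    if (PyStatus_Exception(status)) {
        Py_ExitStatusException(status);
    }
    PyRun_SimpleString("print('hello from a legacy-style subinterpreter')");
    Py_EndInterpreter(substate);

    PyThreadState_Swap(main_tstate);
    Py_Finalize();
    return 0;
}

With check_multi_interp_extensions left at 0, Py_RTFLAGS_MULTI_INTERP_EXTENSIONS stays unset for the new interpreter, which is exactly the case the patched has_shared_intern_dict() treats as shared.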

 static int
@@ -305,8 +311,20 @@ init_interned_dict(PyInterpreterState *interp)
     assert(get_interned_dict(interp) == NULL);
     PyObject *interned;
     if (has_shared_intern_dict(interp)) {
-        interned = get_interned_dict(_PyInterpreterState_Main());
-        Py_INCREF(interned);
+        PyInterpreterState *main = _PyInterpreterState_Main();
+        interned = _Py_INTERP_CACHED_OBJECT(main, interned_strings_legacy);
+        if (interned == NULL) {
+            // Allocate the dict on the main interpreter.  We share obmalloc
+            // in this case, and we use a separate dict because it's cleaner
+            // to ensure these objects don't show up in the main interpreter
+            // (which they could if we used interned_strings).  The dict is
+            // shared by all subinterpreters that allow single-phase init modules.
+            interned = PyDict_New();
+            if (interned == NULL) {
+                return -1;
+            }
+            _Py_INTERP_CACHED_OBJECT(main, interned_strings_legacy) = interned;
+        }
     }
     else {
         interned = PyDict_New();
@@ -318,6 +336,60 @@
     return 0;
 }

+/* Clear the dict of interned strings that is used by subinterpreters that
+ * allow basic single-phase extension modules (has_shared_intern_dict() is
+ * true).  Those interpreters all share the interned_strings_legacy dict
+ * owned by the main interpreter, and only the main interpreter cleans it
+ * up.  See GH-116510.
+ */
+static void
+clear_interned_dict_legacy(PyInterpreterState *interp)
+{
+    PyObject *interned = _Py_INTERP_CACHED_OBJECT(interp,
+                                                  interned_strings_legacy);
+    if (interned == NULL) {
+        return;
+    }
+    // This is similar but slightly different logic compared with
+    // _PyUnicode_ClearInterned().  These are strings created by
+    // subinterpreters but stored in a dict owned by the main interpreter.
+    // Immortalization loses the true reference count, so we must ensure all
+    // those subinterpreters have exited before cleaning these strings up.
+    Py_ssize_t pos = 0;
+    PyObject *s, *ignored_value;
+    while (PyDict_Next(interned, &pos, &s, &ignored_value)) {
+#ifdef Py_TRACE_REFS
+        _Py_AddToAllObjects(s);
+#endif
+        switch (PyUnicode_CHECK_INTERNED(s)) {
+        case SSTATE_INTERNED_IMMORTAL:
+        case SSTATE_INTERNED_IMMORTAL_STATIC:
+            _Py_SetMortal(s, 2);
+#ifdef Py_REF_DEBUG
+            /* let's be pedantic with the ref total */
+            _Py_IncRefTotal(_PyThreadState_GET());
+            _Py_IncRefTotal(_PyThreadState_GET());
+#endif
+            break;
+        case SSTATE_INTERNED_MORTAL:
+            Py_SET_REFCNT(s, Py_REFCNT(s) + 2);
+#ifdef Py_REF_DEBUG
+            /* let's be pedantic with the ref total */
+            _Py_IncRefTotal(_PyThreadState_GET());
+            _Py_IncRefTotal(_PyThreadState_GET());
+#endif
+            break;
+        case SSTATE_NOT_INTERNED:
+            _Py_FALLTHROUGH;
+        default:
+            Py_UNREACHABLE();
+        }
+        _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
+    }
+    PyDict_Clear(interned);
+    _Py_INTERP_CACHED_OBJECT(interp, interned_strings_legacy) = NULL;
+}

 static void
 clear_interned_dict(PyInterpreterState *interp)
 {
@@ -326,8 +398,9 @@
         if (!has_shared_intern_dict(interp)) {
             // only clear if the dict belongs to this interpreter
             PyDict_Clear(interned);
-            Py_DECREF(interned);
+            clear_interned_dict_legacy(interp);
         }
+        Py_DECREF(interned);
         _Py_INTERP_CACHED_OBJECT(interp, interned_strings) = NULL;
     }
 }
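A note on the twin refcount adjustments in clear_interned_dict_legacy() above (an editor's illustration, not part of the patch): an interned-strings dict maps each string to itself, so the dict holds exactly two references per entry, and that is what the "+2" restores before the final PyDict_Clear() drops them. The invariant, demonstrated with public C API only; the assertion values assume a fresh string that is not otherwise cached by the runtime:

#include <Python.h>
#include <assert.h>

int main(void)
{
    Py_Initialize();

    // An interned-strings dict maps s -> s, so it owns two references.
    PyObject *s = PyUnicode_FromString("example");
    PyObject *d = PyDict_New();
    assert(s != NULL && d != NULL);
    assert(Py_REFCNT(s) == 1);      // only our reference so far

    PyDict_SetItem(d, s, s);        // the dict INCREFs both key and value
    assert(Py_REFCNT(s) == 3);      // ours + key + value

    PyDict_Clear(d);                // drops both dict references
    assert(Py_REFCNT(s) == 1);

    Py_DECREF(s);
    Py_DECREF(d);
    Py_Finalize();
    return 0;
}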