python · ericsnowcurrently · May 7, 2024 · Feb 8, 2024 · Apr 16, 2024 · Apr 16, 2024
@@ -3014,20 +3014,20 @@ def test_basic_multiple_interpreters_deleted_no_reset(self):
         #  * alive in 0 interpreters
         #  * module def in _PyRuntime.imports.extensions
         #  * mod init func ran for the first time (since reset)
-        #  * m_copy is NULL (claered when the interpreter was destroyed)
+        #  * m_copy is still set (owned by main interpreter)
         #  * module's global state was initialized, not reset
 
         # Use a subinterpreter that sticks around.
         loaded_interp1 = self.import_in_subinterp(interpid1)
         self.check_common(loaded_interp1)
-        self.check_semi_fresh(loaded_interp1, loaded_main, base)
+        self.check_copied(loaded_interp1, base)
 
         # At this point:
         #  * alive in 1 interpreter (interp1)
         #  * module def still in _PyRuntime.imports.extensions
-        #  * mod init func ran for the second time (since reset)
-        #  * m_copy was copied from interp1 (was NULL)
-        #  * module's global state was updated, not reset
+        #  * mod init func did not run again
+        #  * m_copy was not changed
+        #  * module's global state was not touched
 
         # Use a subinterpreter while the previous one is still alive.
         loaded_interp2 = self.import_in_subinterp(interpid2)
@@ -3038,8 +3038,8 @@ def test_basic_multiple_interpreters_deleted_no_reset(self):
         #  * alive in 2 interpreters (interp1, interp2)
         #  * module def still in _PyRuntime.imports.extensions
         #  * mod init func did not run again
-        #  * m_copy was copied from interp2 (was from interp1)
-        #  * module's global state was updated, not reset
+        #  * m_copy was not changed
+        #  * module's global state was not touched
 
     @requires_subinterpreters
     def test_basic_multiple_interpreters_reset_each(self):

diff --git a/Misc/NEWS.d/next/Core and Builtins/2024-05-06-10-57-54.gh-issue-117953.DqCzIs.rst b/Misc/NEWS.d/next/Core and Builtins/2024-05-06-10-57-54.gh-issue-117953.DqCzIs.rst
@@ -0,0 +1,8 @@
+When a builtin or extension module is imported for the first time, while a
+subinterpreter is active, the module's init function is now run by the main
+interpreter first before import continues in the subinterpreter.
+Consequently, single-phase init modules now fail in an isolated
+subinterpreter without the init function running under that interpreter,
+whereas before it would run under the subinterpreter *before* failing,
+potentially leaving behind global state and callbacks and otherwise leaving
+the module in an inconsistent state.
@@ -1526,6 +1526,26 @@ _PyImport_CheckSubinterpIncompatibleExtensionAllowed(const char *name)
     return 0;
 }
 
+static PyThreadState *
+switch_to_main_interpreter(PyThreadState *tstate)
+{
+    if (_Py_IsMainInterpreter(tstate->interp)) {
+        return tstate;
+    }
+    PyThreadState *main_tstate = PyThreadState_New(_PyInterpreterState_Main());
+    if (main_tstate == NULL) {
+        return NULL;
+    }
+    main_tstate->_whence = _PyThreadState_WHENCE_EXEC;
+#ifndef NDEBUG
+    PyThreadState *old_tstate = PyThreadState_Swap(main_tstate);
+    assert(old_tstate == tstate);
+#else
+    (void)PyThreadState_Swap(main_tstate);
+#endif
+    return main_tstate;
+}
+
 static PyObject *
 get_core_module_dict(PyInterpreterState *interp,
                      PyObject *name, PyObject *path)
@@ -1874,38 +1894,82 @@ import_run_extension(PyThreadState *tstate, PyModInitFunction p0,
     struct extensions_cache_value *cached = NULL;
     const char *name_buf = PyBytes_AS_STRING(info->name_encoded);
 
+    /* We cannot know if the module is single-phase init or
+     * multi-phase init until after we call its init function. Even
+     * in isolated interpreters (that do not support single-phase init),
+     * the init function will run without restriction.  For multi-phase
+     * init modules that isn't a problem because the init function only
+     * runs PyModuleDef_Init() on the module's def and then returns it.
+     *
+     * However, for single-phase init the module's init function will
+     * create the module, create other objects (and allocate other
+     * memory), populate it and its module state, and initialze static
+     * types.  Some modules store other objects and data in global C
+     * variables and register callbacks with the runtime/stdlib or
+     * even external libraries (which is part of why we can't just
+     * dlclose() the module in the error case).  That's a problem
+     * for isolated interpreters since all of the above happens
+     * and only then * will the import fail.  Memory will leak,
+     * callbacks will still get used, and sometimes there
+     * will be crashes (memory access violations
+     * and use-after-free).
+     *
+     * To put it another way, if the module is single-phase init
+     * then the import will probably break interpreter isolation
+     * and should fail ASAP.  However, the module's init function
+     * will still get run.  That means it may still store state
+     * in the shared-object/DLL address space (which never gets
+     * closed/cleared), including objects (e.g. static types).
+     * This is a problem for isolated subinterpreters since each
+     * has its own object allocator.  If the loaded shared-object
+     * still holds a reference to an object after the corresponding
+     * interpreter has finalized then either we must let it leak
+     * or else any later use of that object by another interpreter
+     * (or across multiple init-fini cycles) will crash the process.
+     *
+     * To avoid all of that, we make sure the module's init function
+     * is always run first with the main interpreter active.  If it was
+     * already the main interpreter then we can continue loading the
+     * module like normal.  Otherwise, right after the init function,
+     * we take care of some import state bookkeeping, switch back
+     * to the subinterpreter, check for single-phase init,
+     * and then continue loading like normal. */
+
+    PyThreadState *main_tstate = NULL;
+    if (!_Py_IsMainInterpreter(tstate->interp)) {
+        /* We *could* leave in place a legacy interpreter here
+         * (one that shares obmalloc/GIL with main interp),
+         * but there isn't a big advantage, we anticipate
+         * such interpreters will be increasingly uncommon,
+         * and the code is a bit simpler if we always switch
+         * to the main interpreter. */
+        main_tstate = switch_to_main_interpreter(tstate);
+        if (main_tstate == NULL) {
+            return NULL;
+        }
+        assert(main_tstate != tstate);
+        // XXX Get import lock.
+    }
+
     struct _Py_ext_module_loader_result res;
-    if (_PyImport_RunModInitFunc(p0, info, &res) < 0) {
+    int rc = _PyImport_RunModInitFunc(p0, info, &res);
+    if (rc < 0) {
         /* We discard res.def. */
         assert(res.module == NULL);
-        _Py_ext_module_loader_result_apply_error(&res, name_buf);
-        return NULL;
-    }
-    assert(!PyErr_Occurred());
-    assert(res.err == NULL);
-
-    mod = res.module;
-    res.module = NULL;
-    def = res.def;
-    assert(def != NULL);
-
-    if (res.kind == _Py_ext_module_kind_MULTIPHASE) {
-        assert_multiphase_def(def);
-        assert(mod == NULL);
-        mod = PyModule_FromDefAndSpec(def, spec);
-        if (mod == NULL) {
-            goto error;
-        }
     }
     else {
-        assert(res.kind == _Py_ext_module_kind_SINGLEPHASE);
-        assert_singlephase_def(def);
-        assert(PyModule_Check(mod));
+        assert(!PyErr_Occurred());
+        assert(res.err == NULL);
 
-        if (_PyImport_CheckSubinterpIncompatibleExtensionAllowed(name_buf) < 0) {
-            goto error;
-        }
+        mod = res.module;
+        res.module = NULL;
+        def = res.def;
+        assert(def != NULL);
+    }
 
+    /* Do anything else that should be done
+     * while still using the main interpreter. */
+    if (res.kind == _Py_ext_module_kind_SINGLEPHASE) {
         /* Remember the filename as the __file__ attribute */
         if (info->filename != NULL) {
             if (PyModule_AddObjectRef(mod, "__file__", info->filename) < 0) {
@@ -1941,15 +2005,73 @@ import_run_extension(PyThreadState *tstate, PyModInitFunction p0,
         if (cached == NULL) {
             goto error;
         }
+    }
+
+    /* Switch back to the subinterpreter. */
+    if (main_tstate != NULL) {
+        assert(main_tstate != tstate);
+
+        /* Any module we got from the init function will have to be
+         * reloaded in the subinterpreter. */
+        Py_CLEAR(mod);
+
+        (void)PyThreadState_Swap(tstate);
+    }
+
+    /*****************************************************************/
+    /* At this point we are back to the interpreter we started with. */
+    /*****************************************************************/
+
+    /* Finally we handle the error return from _PyImport_RunModInitFunc(). */
+    if (rc < 0) {
+        _Py_ext_module_loader_result_apply_error(&res, name_buf);
+        goto error;
+    }
 
-        /* Update per-interpreter import state. */
-        PyObject *modules = get_modules_dict(tstate, true);
-        if (finish_singlephase_extension(
-                tstate, mod, cached, info->name, modules) < 0)
-        {
+    if (res.kind == _Py_ext_module_kind_MULTIPHASE) {
+        assert_multiphase_def(def);
+        assert(mod == NULL);
+        /* Note that we cheat a little by not repeating the calls
+         * to _PyImport_GetModInitFunc() and _PyImport_RunModInitFunc(). */
+        mod = PyModule_FromDefAndSpec(def, spec);
+        if (mod == NULL) {
             goto error;
         }
     }
+    else {
+        assert(res.kind == _Py_ext_module_kind_SINGLEPHASE);
+        assert_singlephase_def(def);
+
+        if (_PyImport_CheckSubinterpIncompatibleExtensionAllowed(name_buf) < 0) {
+            goto error;
+        }
+        assert(!PyErr_Occurred());
+
+        if (main_tstate != NULL) {
+            /* We switched to the main interpreter to run the init
+             * function, so now we will "reload" the module from the
+             * cached data using the original subinterpreter. */
+            assert(mod == NULL);
+            mod = reload_singlephase_extension(tstate, cached, info);
+            if (mod == NULL) {
+                goto error;
+            }
+            assert(!PyErr_Occurred());
+            assert(PyModule_Check(mod));
+        }
+        else {
+            assert(mod != NULL);
+            assert(PyModule_Check(mod));
+
+            /* Update per-interpreter import state. */
+            PyObject *modules = get_modules_dict(tstate, true);
+            if (finish_singlephase_extension(
+                    tstate, mod, cached, info->name, modules) < 0)
+            {
+                goto error;
+            }
+        }
+    }
 
     _Py_ext_module_loader_result_clear(&res);
     return mod;