diff --git a/Include/cpython/initconfig.h b/Include/cpython/initconfig.h
index 8bc681b1a93f5c..79c1023baa9a0f 100644
--- a/Include/cpython/initconfig.h
+++ b/Include/cpython/initconfig.h
@@ -245,6 +245,8 @@ PyAPI_FUNC(PyStatus) PyConfig_SetWideStringList(PyConfig *config,
 /* --- PyInterpreterConfig ------------------------------------ */
 
 typedef struct {
+    // XXX "allow_object_sharing"?  "own_objects"?
+    int use_main_obmalloc;
     int allow_fork;
     int allow_exec;
     int allow_threads;
@@ -254,6 +256,7 @@ typedef struct {
 
 #define _PyInterpreterConfig_INIT \
     { \
+        .use_main_obmalloc = 0, \
         .allow_fork = 0, \
         .allow_exec = 0, \
         .allow_threads = 1, \
@@ -263,6 +266,7 @@ typedef struct {
 
 #define _PyInterpreterConfig_LEGACY_INIT \
     { \
+        .use_main_obmalloc = 1, \
         .allow_fork = 1, \
         .allow_exec = 1, \
         .allow_threads = 1, \
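
For reference, a minimal embedding sketch of the new field, assuming a creation entry point in the style of _Py_NewInterpreterFromConfig() (that function's exact name and signature are not part of this diff and are used here for illustration only):

    #include <Python.h>

    /* Hypothetical helper; everything except _PyInterpreterConfig and its
       use_main_obmalloc field is an assumption, not part of this patch. */
    static int
    spawn_isolated_interpreter(void)
    {
        PyThreadState *save_tstate = PyThreadState_Get();

        /* The strict defaults: use_main_obmalloc == 0 gives the new
           interpreter its own obmalloc state. */
        _PyInterpreterConfig config = _PyInterpreterConfig_INIT;
        /* Set explicitly for clarity: per-interpreter obmalloc requires
           multi-phase-init extension checking, as enforced in
           init_interp_settings() later in this patch. */
        config.check_multi_interp_extensions = 1;

        PyThreadState *tstate = _Py_NewInterpreterFromConfig(&config);
        if (tstate == NULL) {
            PyThreadState_Swap(save_tstate);
            return -1;
        }
        /* ... run code in the subinterpreter ... */
        Py_EndInterpreter(tstate);
        PyThreadState_Swap(save_tstate);
        return 0;
    }
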
diff --git a/Include/cpython/pystate.h b/Include/cpython/pystate.h
index ea6ed8d2bc4a4c..f33c72d4cf4d2a 100644
--- a/Include/cpython/pystate.h
+++ b/Include/cpython/pystate.h
@@ -11,6 +11,10 @@ is available in a given context.  For example, forking the process
 might not be allowed in the current interpreter (i.e. os.fork() would fail).
 */
 
+/* Set if the interpreter shares obmalloc runtime state
+   with the main interpreter. */
+#define Py_RTFLAGS_USE_MAIN_OBMALLOC (1UL << 5)
+
 /* Set if import should check a module for subinterpreter support. */
 #define Py_RTFLAGS_MULTI_INTERP_EXTENSIONS (1UL << 8)
 
diff --git a/Include/internal/pycore_interp.h b/Include/internal/pycore_interp.h
index 86ae3d8dfc1860..7276ce35ba68f0 100644
--- a/Include/internal/pycore_interp.h
+++ b/Include/internal/pycore_interp.h
@@ -23,11 +23,12 @@ extern "C" {
 #include "pycore_function.h"      // FUNC_MAX_WATCHERS
 #include "pycore_genobject.h"     // struct _Py_async_gen_state
 #include "pycore_gc.h"            // struct _gc_runtime_state
+#include "pycore_global_objects.h"  // struct _Py_interp_static_objects
 #include "pycore_import.h"        // struct _import_state
 #include "pycore_instruments.h"   // PY_MONITORING_EVENTS
 #include "pycore_list.h"          // struct _Py_list_state
-#include "pycore_global_objects.h"  // struct _Py_interp_static_objects
 #include "pycore_object_state.h"   // struct _py_object_state
+#include "pycore_obmalloc.h"      // struct obmalloc_state
 #include "pycore_tuple.h"         // struct _Py_tuple_state
 #include "pycore_typeobject.h"    // struct type_cache
 #include "pycore_unicodeobject.h" // struct _Py_unicode_state
@@ -82,6 +83,8 @@ struct _is {
     int _initialized;
     int finalizing;
 
+    struct _obmalloc_state obmalloc;
+
     struct _ceval_state ceval;
     struct _gc_runtime_state gc;
 
diff --git a/Include/internal/pycore_obmalloc.h b/Include/internal/pycore_obmalloc.h
index a5c7f4528f9126..ca2a0419b4f038 100644
--- a/Include/internal/pycore_obmalloc.h
+++ b/Include/internal/pycore_obmalloc.h
@@ -657,8 +657,12 @@ struct _obmalloc_usage {
 #endif /* WITH_PYMALLOC_RADIX_TREE */
 
 
-struct _obmalloc_state {
+struct _obmalloc_global_state {
     int dump_debug_stats;
+    Py_ssize_t interpreter_leaks;
+};
+
+struct _obmalloc_state {
     struct _obmalloc_pools pools;
     struct _obmalloc_mgmt mgmt;
     struct _obmalloc_usage usage;
@@ -675,7 +679,11 @@ void _PyObject_VirtualFree(void *, size_t size);
 
 
 /* This function returns the number of allocated memory blocks, regardless of size */
-PyAPI_FUNC(Py_ssize_t) _Py_GetAllocatedBlocks(void);
+extern Py_ssize_t _Py_GetGlobalAllocatedBlocks(void);
+#define _Py_GetAllocatedBlocks() \
+    _Py_GetGlobalAllocatedBlocks()
+extern Py_ssize_t _PyInterpreterState_GetAllocatedBlocks(PyInterpreterState *);
+extern void _PyInterpreterState_FinalizeAllocatedBlocks(PyInterpreterState *);
 
 
 #ifdef WITH_PYMALLOC
diff --git a/Include/internal/pycore_obmalloc_init.h b/Include/internal/pycore_obmalloc_init.h
index c9f197e72de9f5..8ee72ff2d4126f 100644
--- a/Include/internal/pycore_obmalloc_init.h
+++ b/Include/internal/pycore_obmalloc_init.h
@@ -54,9 +54,13 @@ extern "C" {
 #  error "NB_SMALL_SIZE_CLASSES should be less than 64"
 #endif
 
-#define _obmalloc_state_INIT(obmalloc) \
+#define _obmalloc_global_state_INIT \
     { \
         .dump_debug_stats = -1, \
+    }
+
+#define _obmalloc_state_INIT(obmalloc) \
+    { \
         .pools = { \
             .used = _obmalloc_pools_INIT(obmalloc.pools), \
         }, \
diff --git a/Include/internal/pycore_pylifecycle.h b/Include/internal/pycore_pylifecycle.h
index a899e848bb8b3c..f96261a650dac7 100644
--- a/Include/internal/pycore_pylifecycle.h
+++ b/Include/internal/pycore_pylifecycle.h
@@ -64,6 +64,7 @@ extern void _PyAtExit_Fini(PyInterpreterState *interp);
 extern void _PyThread_FiniType(PyInterpreterState *interp);
 extern void _Py_Deepfreeze_Fini(void);
 extern void _PyArg_Fini(void);
+extern void _Py_FinalizeAllocatedBlocks(_PyRuntimeState *);
 
 extern PyStatus _PyGILState_Init(PyInterpreterState *interp);
 extern PyStatus _PyGILState_SetTstate(PyThreadState *tstate);
diff --git a/Include/internal/pycore_pystate.h b/Include/internal/pycore_pystate.h
index c40f9e7393a16f..180ea676bc22eb 100644
--- a/Include/internal/pycore_pystate.h
+++ b/Include/internal/pycore_pystate.h
@@ -33,6 +33,13 @@ _Py_IsMainInterpreter(PyInterpreterState *interp)
     return (interp == _PyInterpreterState_Main());
 }
 
+static inline int
+_Py_IsMainInterpreterFinalizing(PyInterpreterState *interp)
+{
+    return (_PyRuntimeState_GetFinalizing(interp->runtime) != NULL &&
+            interp == &interp->runtime->_main_interpreter);
+}
+
 
 static inline const PyConfig *
 _Py_GetMainConfig(void)
diff --git a/Include/internal/pycore_runtime.h b/Include/internal/pycore_runtime.h
index 2a3fd8ab2813ea..0ee06631ac0672 100644
--- a/Include/internal/pycore_runtime.h
+++ b/Include/internal/pycore_runtime.h
@@ -21,7 +21,6 @@ extern "C" {
 #include "pycore_pymem.h"           // struct _pymem_allocators
 #include "pycore_pyhash.h"          // struct pyhash_runtime_state
 #include "pycore_pythread.h"        // struct _pythread_runtime_state
-#include "pycore_obmalloc.h"        // struct obmalloc_state
 #include "pycore_signal.h"          // struct _signals_runtime_state
 #include "pycore_time.h"            // struct _time_runtime_state
 #include "pycore_tracemalloc.h"     // struct _tracemalloc_runtime_state
@@ -87,7 +86,7 @@ typedef struct pyruntimestate {
     _Py_atomic_address _finalizing;
 
     struct _pymem_allocators allocators;
-    struct _obmalloc_state obmalloc;
+    struct _obmalloc_global_state obmalloc;
     struct pyhash_runtime_state pyhash_state;
     struct _time_runtime_state time;
     struct _pythread_runtime_state threads;
diff --git a/Include/internal/pycore_runtime_init.h b/Include/internal/pycore_runtime_init.h
index d8425b3199a89a..0ac235fc5c2496 100644
--- a/Include/internal/pycore_runtime_init.h
+++ b/Include/internal/pycore_runtime_init.h
@@ -29,7 +29,7 @@ extern PyTypeObject _PyExc_MemoryError;
             _pymem_allocators_debug_INIT, \
             _pymem_allocators_obj_arena_INIT, \
         }, \
-        .obmalloc = _obmalloc_state_INIT(runtime.obmalloc), \
+        .obmalloc = _obmalloc_global_state_INIT, \
         .pyhash_state = pyhash_state_INIT, \
         .signals = _signals_RUNTIME_INIT, \
         .interpreters = { \
@@ -93,6 +93,7 @@ extern PyTypeObject _PyExc_MemoryError;
     { \
         .id_refcount = -1, \
         .imports = IMPORTS_INIT, \
+        .obmalloc = _obmalloc_state_INIT(INTERP.obmalloc), \
         .ceval = { \
             .recursion_limit = Py_DEFAULT_RECURSION_LIMIT, \
         }, \
diff --git a/Lib/test/test_capi/test_misc.py b/Lib/test/test_capi/test_misc.py
index 637adc01a331ce..eab693001961a8 100644
--- a/Lib/test/test_capi/test_misc.py
+++ b/Lib/test/test_capi/test_misc.py
@@ -1211,20 +1211,25 @@ def test_configured_settings(self):
         """
         import json
 
+        OBMALLOC = 1<<5
         EXTENSIONS = 1<<8
         THREADS = 1<<10
         DAEMON_THREADS = 1<<11
         FORK = 1<<15
         EXEC = 1<<16
 
-        features = ['fork', 'exec', 'threads', 'daemon_threads', 'extensions']
+        features = ['obmalloc', 'fork', 'exec', 'threads', 'daemon_threads',
+                    'extensions']
         kwlist = [f'allow_{n}' for n in features]
+        kwlist[0] = 'use_main_obmalloc'
         kwlist[-1] = 'check_multi_interp_extensions'
+
+        # expected to work
         for config, expected in {
-            (True, True, True, True, True):
-                FORK | EXEC | THREADS | DAEMON_THREADS | EXTENSIONS,
-            (False, False, False, False, False): 0,
-            (False, False, True, False, True): THREADS | EXTENSIONS,
+            (True, True, True, True, True, True):
+                OBMALLOC | FORK | EXEC | THREADS | DAEMON_THREADS | EXTENSIONS,
+            (True, False, False, False, False, False): OBMALLOC,
+            (False, False, False, True, False, True): THREADS | EXTENSIONS,
         }.items():
             kwargs = dict(zip(kwlist, config))
             expected = {
@@ -1246,6 +1251,20 @@ def test_configured_settings(self):
 
                 self.assertEqual(settings, expected)
 
+        # expected to fail
+        for config in [
+            (False, False, False, False, False, False),
+        ]:
+            kwargs = dict(zip(kwlist, config))
+            with self.subTest(config):
+                script = textwrap.dedent(f'''
+                    import _testinternalcapi
+                    _testinternalcapi.get_interp_settings()
+                    raise NotImplementedError('unreachable')
+                    ''')
+                with self.assertRaises(RuntimeError):
+                    support.run_in_subinterp_with_config(script, **kwargs)
+
     @unittest.skipIf(_testsinglephase is None, "test requires _testsinglephase module")
     @unittest.skipUnless(hasattr(os, "pipe"), "requires os.pipe()")
     def test_overridden_setting_extensions_subinterp_check(self):
@@ -1257,13 +1276,15 @@ def test_overridden_setting_extensions_subinterp_check(self):
         """
         import json
 
+        OBMALLOC = 1<<5
         EXTENSIONS = 1<<8
         THREADS = 1<<10
         DAEMON_THREADS = 1<<11
         FORK = 1<<15
         EXEC = 1<<16
-        BASE_FLAGS = FORK | EXEC | THREADS | DAEMON_THREADS
+        BASE_FLAGS = OBMALLOC | FORK | EXEC | THREADS | DAEMON_THREADS
         base_kwargs = {
+            'use_main_obmalloc': True,
             'allow_fork': True,
             'allow_exec': True,
             'allow_threads': True,
diff --git a/Lib/test/test_embed.py b/Lib/test/test_embed.py
index e56d0db8627e91..f702ffb99905a5 100644
--- a/Lib/test/test_embed.py
+++ b/Lib/test/test_embed.py
@@ -1656,6 +1656,7 @@ def test_init_use_frozen_modules(self):
                                        api=API_PYTHON, env=env)
 
     def test_init_main_interpreter_settings(self):
+        OBMALLOC = 1<<5
         EXTENSIONS = 1<<8
         THREADS = 1<<10
         DAEMON_THREADS = 1<<11
@@ -1664,7 +1665,7 @@ def test_init_main_interpreter_settings(self):
         expected = {
             # All optional features should be enabled.
             'feature_flags':
-                FORK | EXEC | THREADS | DAEMON_THREADS,
+                OBMALLOC | FORK | EXEC | THREADS | DAEMON_THREADS,
         }
         out, err = self.run_embedded_interpreter(
             'test_init_main_interpreter_settings',
diff --git a/Lib/test/test_import/__init__.py b/Lib/test/test_import/__init__.py
index 66ae554f984fa2..f206e52faf8c5c 100644
--- a/Lib/test/test_import/__init__.py
+++ b/Lib/test/test_import/__init__.py
@@ -1636,7 +1636,12 @@ class SubinterpImportTests(unittest.TestCase):
         allow_exec=False,
         allow_threads=True,
         allow_daemon_threads=False,
+        # Isolation-related config values aren't included here.
     )
+    ISOLATED = dict(
+        use_main_obmalloc=False,
+    )
+    NOT_ISOLATED = {k: not v for k, v in ISOLATED.items()}
 
     @unittest.skipUnless(hasattr(os, "pipe"), "requires os.pipe()")
     def pipe(self):
@@ -1669,6 +1674,7 @@ def import_script(self, name, fd, check_override=None):
     def run_here(self, name, *,
                  check_singlephase_setting=False,
                  check_singlephase_override=None,
+                 isolated=False,
                  ):
         """
         Try importing the named module in a subinterpreter.
@@ -1689,6 +1695,7 @@ def run_here(self, name, *,
 
         kwargs = dict(
             **self.RUN_KWARGS,
+            **(self.ISOLATED if isolated else self.NOT_ISOLATED),
             check_multi_interp_extensions=check_singlephase_setting,
         )
 
@@ -1699,33 +1706,36 @@ def run_here(self, name, *,
         self.assertEqual(ret, 0)
         return os.read(r, 100)
 
-    def check_compatible_here(self, name, *, strict=False):
+    def check_compatible_here(self, name, *, strict=False, isolated=False):
         # Verify that the named module may be imported in a subinterpreter.
         # (See run_here() for more info.)
         out = self.run_here(name,
                             check_singlephase_setting=strict,
+                            isolated=isolated,
                             )
         self.assertEqual(out, b'okay')
 
-    def check_incompatible_here(self, name):
+    def check_incompatible_here(self, name, *, isolated=False):
         # Differences from check_compatible_here():
         #  * verify that import fails
         #  * "strict" is always True
         out = self.run_here(name,
                             check_singlephase_setting=True,
+                            isolated=isolated,
                             )
         self.assertEqual(
             out.decode('utf-8'),
             f'ImportError: module {name} does not support loading in subinterpreters',
         )
 
-    def check_compatible_fresh(self, name, *, strict=False):
+    def check_compatible_fresh(self, name, *, strict=False, isolated=False):
         # Differences from check_compatible_here():
         #  * subinterpreter in a new process
         #  * module has never been imported before in that process
         #  * this tests importing the module for the first time
         kwargs = dict(
             **self.RUN_KWARGS,
+            **(self.ISOLATED if isolated else self.NOT_ISOLATED),
             check_multi_interp_extensions=strict,
         )
         _, out, err = script_helper.assert_python_ok('-c', textwrap.dedent(f'''
@@ -1743,12 +1753,13 @@ def check_compatible_fresh(self, name, *, strict=False):
         self.assertEqual(err, b'')
         self.assertEqual(out, b'okay')
 
-    def check_incompatible_fresh(self, name):
+    def check_incompatible_fresh(self, name, *, isolated=False):
         # Differences from check_compatible_fresh():
         #  * verify that import fails
         #  * "strict" is always True
         kwargs = dict(
             **self.RUN_KWARGS,
+            **(self.ISOLATED if isolated else self.NOT_ISOLATED),
             check_multi_interp_extensions=True,
         )
         _, out, err = script_helper.assert_python_ok('-c', textwrap.dedent(f'''
@@ -1854,6 +1865,14 @@ def check_incompatible(setting, override):
         with self.subTest('config: check disabled; override: disabled'):
             check_compatible(False, -1)
 
+    def test_isolated_config(self):
+        module = 'threading'
+        require_pure_python(module)
+        with self.subTest(f'{module}: strict, not fresh'):
+            self.check_compatible_here(module, strict=True, isolated=True)
+        with self.subTest(f'{module}: strict, fresh'):
+            self.check_compatible_fresh(module, strict=True, isolated=True)
+
 
 class TestSinglePhaseSnapshot(ModuleSnapshot):
 
diff --git a/Lib/test/test_threading.py b/Lib/test/test_threading.py
index a39a267b403d83..fdd74c37e26235 100644
--- a/Lib/test/test_threading.py
+++ b/Lib/test/test_threading.py
@@ -1343,6 +1343,7 @@ def func():
             import test.support
             test.support.run_in_subinterp_with_config(
                 {subinterp_code!r},
+                use_main_obmalloc=True,
                 allow_fork=True,
                 allow_exec=True,
                 allow_threads={allowed},
diff --git a/Modules/_testcapimodule.c b/Modules/_testcapimodule.c
index 30b2674d543c67..c1892f6fa0a4b8 100644
--- a/Modules/_testcapimodule.c
+++ b/Modules/_testcapimodule.c
@@ -1482,6 +1482,7 @@ static PyObject *
 run_in_subinterp_with_config(PyObject *self, PyObject *args, PyObject *kwargs)
 {
     const char *code;
+    int use_main_obmalloc = -1;
     int allow_fork = -1;
     int allow_exec = -1;
     int allow_threads = -1;
@@ -1493,6 +1494,7 @@ run_in_subinterp_with_config(PyObject *self, PyObject *args, PyObject *kwargs)
     PyCompilerFlags cflags = {0};
 
     static char *kwlist[] = {"code",
+                             "use_main_obmalloc",
                              "allow_fork",
                              "allow_exec",
                              "allow_threads",
@@ -1500,12 +1502,17 @@ run_in_subinterp_with_config(PyObject *self, PyObject *args, PyObject *kwargs)
                              "check_multi_interp_extensions",
                              NULL};
     if (!PyArg_ParseTupleAndKeywords(args, kwargs,
-                    "s$ppppp:run_in_subinterp_with_config", kwlist,
-                    &code, &allow_fork, &allow_exec,
+                    "s$pppppp:run_in_subinterp_with_config", kwlist,
+                    &code, &use_main_obmalloc,
+                    &allow_fork, &allow_exec,
                     &allow_threads, &allow_daemon_threads,
                     &check_multi_interp_extensions)) {
         return NULL;
     }
+    if (use_main_obmalloc < 0) {
+        PyErr_SetString(PyExc_ValueError, "missing use_main_obmalloc");
+        return NULL;
+    }
     if (allow_fork < 0) {
         PyErr_SetString(PyExc_ValueError, "missing allow_fork");
         return NULL;
@@ -1532,6 +1539,7 @@ run_in_subinterp_with_config(PyObject *self, PyObject *args, PyObject *kwargs)
     PyThreadState_Swap(NULL);
 
     const _PyInterpreterConfig config = {
+        .use_main_obmalloc = use_main_obmalloc,
         .allow_fork = allow_fork,
         .allow_exec = allow_exec,
         .allow_threads = allow_threads,
diff --git a/Objects/object.c b/Objects/object.c
index a784e6bcbf97f9..65c296e9340601 100644
--- a/Objects/object.c
+++ b/Objects/object.c
@@ -145,7 +145,7 @@ _PyDebug_PrintTotalRefs(void) {
     _PyRuntimeState *runtime = &_PyRuntime;
     fprintf(stderr,
             "[%zd refs, %zd blocks]\n",
-            get_global_reftotal(runtime), _Py_GetAllocatedBlocks());
+            get_global_reftotal(runtime), _Py_GetGlobalAllocatedBlocks());
     /* It may be helpful to also print the "legacy" reftotal separately.
        Likewise for the total for each interpreter. */
 }
diff --git a/Objects/obmalloc.c b/Objects/obmalloc.c
index 5e1bcda1d976bb..de62aeb04461fa 100644
--- a/Objects/obmalloc.c
+++ b/Objects/obmalloc.c
@@ -725,20 +725,51 @@ PyObject_Free(void *ptr)
 static int running_on_valgrind = -1;
 #endif
 
+typedef struct _obmalloc_state OMState;
 
-#define allarenas (_PyRuntime.obmalloc.mgmt.arenas)
-#define maxarenas (_PyRuntime.obmalloc.mgmt.maxarenas)
-#define unused_arena_objects (_PyRuntime.obmalloc.mgmt.unused_arena_objects)
-#define usable_arenas (_PyRuntime.obmalloc.mgmt.usable_arenas)
-#define nfp2lasta (_PyRuntime.obmalloc.mgmt.nfp2lasta)
-#define narenas_currently_allocated (_PyRuntime.obmalloc.mgmt.narenas_currently_allocated)
-#define ntimes_arena_allocated (_PyRuntime.obmalloc.mgmt.ntimes_arena_allocated)
-#define narenas_highwater (_PyRuntime.obmalloc.mgmt.narenas_highwater)
-#define raw_allocated_blocks (_PyRuntime.obmalloc.mgmt.raw_allocated_blocks)
+static inline int
+has_own_state(PyInterpreterState *interp)
+{
+    return (_Py_IsMainInterpreter(interp) ||
+            !(interp->feature_flags & Py_RTFLAGS_USE_MAIN_OBMALLOC) ||
+            _Py_IsMainInterpreterFinalizing(interp));
+}
+
+static inline OMState *
+get_state(void)
+{
+    PyInterpreterState *interp = _PyInterpreterState_GET();
+    if (!has_own_state(interp)) {
+        interp = _PyInterpreterState_Main();
+    }
+    return &interp->obmalloc;
+}
+
+// These macros all rely on a local "state" variable.
+#define usedpools (state->pools.used)
+#define allarenas (state->mgmt.arenas)
+#define maxarenas (state->mgmt.maxarenas)
+#define unused_arena_objects (state->mgmt.unused_arena_objects)
+#define usable_arenas (state->mgmt.usable_arenas)
+#define nfp2lasta (state->mgmt.nfp2lasta)
+#define narenas_currently_allocated (state->mgmt.narenas_currently_allocated)
+#define ntimes_arena_allocated (state->mgmt.ntimes_arena_allocated)
+#define narenas_highwater (state->mgmt.narenas_highwater)
+#define raw_allocated_blocks (state->mgmt.raw_allocated_blocks)
 
 Py_ssize_t
-_Py_GetAllocatedBlocks(void)
+_PyInterpreterState_GetAllocatedBlocks(PyInterpreterState *interp)
 {
+#ifdef Py_DEBUG
+    assert(has_own_state(interp));
+#else
+    if (!has_own_state(interp)) {
+        _Py_FatalErrorFunc(__func__,
+                           "the interpreter doesn't have its own allocator");
+    }
+#endif
+    OMState *state = &interp->obmalloc;
+
     Py_ssize_t n = raw_allocated_blocks;
     /* add up allocated blocks for used pools */
     for (uint i = 0; i < maxarenas; ++i) {
@@ -759,20 +790,100 @@ _Py_GetAllocatedBlocks(void)
     return n;
 }
 
+void
+_PyInterpreterState_FinalizeAllocatedBlocks(PyInterpreterState *interp)
+{
+    if (has_own_state(interp)) {
+        Py_ssize_t leaked = _PyInterpreterState_GetAllocatedBlocks(interp);
+        assert(has_own_state(interp) || leaked == 0);
+        interp->runtime->obmalloc.interpreter_leaks += leaked;
+    }
+}
+
+static Py_ssize_t get_num_global_allocated_blocks(_PyRuntimeState *);
+
+/* We preserve the number of blocks leaked during runtime finalization,
+   so they can be reported if the runtime is initialized again. */
+// XXX We don't lose any information by dropping this,
+// so we should consider doing so.
+static Py_ssize_t last_final_leaks = 0;
+
+void
+_Py_FinalizeAllocatedBlocks(_PyRuntimeState *runtime)
+{
+    last_final_leaks = get_num_global_allocated_blocks(runtime);
+    runtime->obmalloc.interpreter_leaks = 0;
+}
+
+static Py_ssize_t
+get_num_global_allocated_blocks(_PyRuntimeState *runtime)
+{
+    Py_ssize_t total = 0;
+    if (_PyRuntimeState_GetFinalizing(runtime) != NULL) {
+        PyInterpreterState *interp = _PyInterpreterState_Main();
+        if (interp == NULL) {
+            /* We are at the very end of runtime finalization.
+               We can't rely on finalizing->interp since that thread
+               state is probably already freed, so we don't worry
+               about it. */
+            assert(PyInterpreterState_Head() == NULL);
+        }
+        else {
+            assert(interp != NULL);
+            /* It is probably the last interpreter but not necessarily. */
+            assert(PyInterpreterState_Next(interp) == NULL);
+            total += _PyInterpreterState_GetAllocatedBlocks(interp);
+        }
+    }
+    else {
+        HEAD_LOCK(runtime);
+        PyInterpreterState *interp = PyInterpreterState_Head();
+        assert(interp != NULL);
+#ifdef Py_DEBUG
+        int got_main = 0;
+#endif
+        for (; interp != NULL; interp = PyInterpreterState_Next(interp)) {
+#ifdef Py_DEBUG
+            if (_Py_IsMainInterpreter(interp)) {
+                assert(!got_main);
+                got_main = 1;
+                assert(has_own_state(interp));
+            }
+#endif
+            if (has_own_state(interp)) {
+                total += _PyInterpreterState_GetAllocatedBlocks(interp);
+            }
+        }
+        HEAD_UNLOCK(runtime);
+#ifdef Py_DEBUG
+        assert(got_main);
+#endif
+    }
+    total += runtime->obmalloc.interpreter_leaks;
+    total += last_final_leaks;
+    return total;
+}
+
+Py_ssize_t
+_Py_GetGlobalAllocatedBlocks(void)
+{
+    return get_num_global_allocated_blocks(&_PyRuntime);
+}
+
 #if WITH_PYMALLOC_RADIX_TREE
 /*==========================================================================*/
 /* radix tree for tracking arena usage. */
 
-#define arena_map_root (_PyRuntime.obmalloc.usage.arena_map_root)
+#define arena_map_root (state->usage.arena_map_root)
 #ifdef USE_INTERIOR_NODES
-#define arena_map_mid_count (_PyRuntime.obmalloc.usage.arena_map_mid_count)
-#define arena_map_bot_count (_PyRuntime.obmalloc.usage.arena_map_bot_count)
+#define arena_map_mid_count (state->usage.arena_map_mid_count)
+#define arena_map_bot_count (state->usage.arena_map_bot_count)
 #endif
 
 /* Return a pointer to a bottom tree node, return NULL if it doesn't exist or
  * it cannot be created */
 static Py_ALWAYS_INLINE arena_map_bot_t *
-arena_map_get(pymem_block *p, int create)
+arena_map_get(OMState *state, pymem_block *p, int create)
 {
 #ifdef USE_INTERIOR_NODES
     /* sanity check that IGNORE_BITS is correct */
@@ -833,11 +944,12 @@ arena_map_get(pymem_block *p, int create)
 
 /* mark or unmark addresses covered by arena */
 static int
-arena_map_mark_used(uintptr_t arena_base, int is_used)
+arena_map_mark_used(OMState *state, uintptr_t arena_base, int is_used)
 {
     /* sanity check that IGNORE_BITS is correct */
     assert(HIGH_BITS(arena_base) == HIGH_BITS(&arena_map_root));
-    arena_map_bot_t *n_hi = arena_map_get((pymem_block *)arena_base, is_used);
+    arena_map_bot_t *n_hi = arena_map_get(
+            state, (pymem_block *)arena_base, is_used);
     if (n_hi == NULL) {
         assert(is_used); /* otherwise node should already exist */
         return 0; /* failed to allocate space for node */
@@ -862,7 +974,8 @@ arena_map_mark_used(uintptr_t arena_base, int is_used)
          * must overflow to 0.  However, that would mean arena_base was
          * "ideal" and we should not be in this case. */
         assert(arena_base < arena_base_next);
-        arena_map_bot_t *n_lo = arena_map_get((pymem_block *)arena_base_next, is_used);
+        arena_map_bot_t *n_lo = arena_map_get(
+                state, (pymem_block *)arena_base_next, is_used);
         if (n_lo == NULL) {
             assert(is_used); /* otherwise should already exist */
             n_hi->arenas[i3].tail_hi = 0;
@@ -877,9 +990,9 @@ arena_map_mark_used(uintptr_t arena_base, int is_used)
 /* Return true if 'p' is a pointer inside an obmalloc arena.
  * _PyObject_Free() calls this so it needs to be very fast. */
 static int
-arena_map_is_used(pymem_block *p)
+arena_map_is_used(OMState *state, pymem_block *p)
 {
-    arena_map_bot_t *n = arena_map_get(p, 0);
+    arena_map_bot_t *n = arena_map_get(state, p, 0);
     if (n == NULL) {
         return 0;
     }
@@ -902,7 +1015,7 @@ arena_map_is_used(pymem_block *p)
  * `usable_arenas` to the return value.
  */
 static struct arena_object*
-new_arena(void)
+new_arena(OMState *state)
 {
     struct arena_object* arenaobj;
     uint excess;        /* number of bytes above pool alignment */
@@ -968,7 +1081,7 @@ new_arena(void)
     address = _PyObject_Arena.alloc(_PyObject_Arena.ctx, ARENA_SIZE);
 #if WITH_PYMALLOC_RADIX_TREE
     if (address != NULL) {
-        if (!arena_map_mark_used((uintptr_t)address, 1)) {
+        if (!arena_map_mark_used(state, (uintptr_t)address, 1)) {
             /* marking arena in radix tree failed, abort */
             _PyObject_Arena.free(_PyObject_Arena.ctx, address, ARENA_SIZE);
             address = NULL;
@@ -1011,9 +1124,9 @@ new_arena(void)
    pymalloc.  When the radix tree is used, 'poolp' is unused.
  */
 static bool
-address_in_range(void *p, poolp Py_UNUSED(pool))
+address_in_range(OMState *state, void *p, poolp Py_UNUSED(pool))
 {
-    return arena_map_is_used(p);
+    return arena_map_is_used(state, p);
 }
 #else
 /*
@@ -1094,7 +1207,7 @@ extremely desirable that it be this fast.
 static bool _Py_NO_SANITIZE_ADDRESS
             _Py_NO_SANITIZE_THREAD
             _Py_NO_SANITIZE_MEMORY
-address_in_range(void *p, poolp pool)
+address_in_range(OMState *state, void *p, poolp pool)
 {
     // Since address_in_range may be reading from memory which was not allocated
     // by Python, it is important that pool->arenaindex is read only once, as
@@ -1111,8 +1224,6 @@ address_in_range(void *p, poolp pool)
 
 /*==========================================================================*/
 
-#define usedpools (_PyRuntime.obmalloc.pools.used)
-
 // Called when freelist is exhausted.  Extend the freelist if there is
 // space for a block.  Otherwise, remove this pool from usedpools.
 static void
@@ -1138,7 +1249,7 @@ pymalloc_pool_extend(poolp pool, uint size)
  * This function takes new pool and allocate a block from it.
  */
 static void*
-allocate_from_new_pool(uint size)
+allocate_from_new_pool(OMState *state, uint size)
 {
     /* There isn't a pool of the right size class immediately
      * available:  use a free pool.
@@ -1150,7 +1261,7 @@ allocate_from_new_pool(uint size)
             return NULL;
         }
 #endif
-        usable_arenas = new_arena();
+        usable_arenas = new_arena(state);
         if (usable_arenas == NULL) {
             return NULL;
         }
@@ -1274,7 +1385,7 @@ allocate_from_new_pool(uint size)
    or when the max memory limit has been reached.
 */
 static inline void*
-pymalloc_alloc(void *Py_UNUSED(ctx), size_t nbytes)
+pymalloc_alloc(OMState *state, void *Py_UNUSED(ctx), size_t nbytes)
 {
 #ifdef WITH_VALGRIND
     if (UNLIKELY(running_on_valgrind == -1)) {
@@ -1314,7 +1425,7 @@ pymalloc_alloc(void *Py_UNUSED(ctx), size_t nbytes)
         /* There isn't a pool of the right size class immediately
          * available:  use a free pool.
          */
-        bp = allocate_from_new_pool(size);
+        bp = allocate_from_new_pool(state, size);
     }
 
     return (void *)bp;
@@ -1324,7 +1435,8 @@ pymalloc_alloc(void *Py_UNUSED(ctx), size_t nbytes)
 void *
 _PyObject_Malloc(void *ctx, size_t nbytes)
 {
-    void* ptr = pymalloc_alloc(ctx, nbytes);
+    OMState *state = get_state();
+    void* ptr = pymalloc_alloc(state, ctx, nbytes);
     if (LIKELY(ptr != NULL)) {
         return ptr;
     }
@@ -1343,7 +1455,8 @@ _PyObject_Calloc(void *ctx, size_t nelem, size_t elsize)
     assert(elsize == 0 || nelem <= (size_t)PY_SSIZE_T_MAX / elsize);
     size_t nbytes = nelem * elsize;
 
-    void* ptr = pymalloc_alloc(ctx, nbytes);
+    OMState *state = get_state();
+    void* ptr = pymalloc_alloc(state, ctx, nbytes);
     if (LIKELY(ptr != NULL)) {
         memset(ptr, 0, nbytes);
         return ptr;
@@ -1358,7 +1471,7 @@ _PyObject_Calloc(void *ctx, size_t nelem, size_t elsize)
 
 
 static void
-insert_to_usedpool(poolp pool)
+insert_to_usedpool(OMState *state, poolp pool)
 {
     assert(pool->ref.count > 0);            /* else the pool is empty */
 
@@ -1374,7 +1487,7 @@ insert_to_usedpool(poolp pool)
 }
 
 static void
-insert_to_freepool(poolp pool)
+insert_to_freepool(OMState *state, poolp pool)
 {
     poolp next = pool->nextpool;
     poolp prev = pool->prevpool;
@@ -1457,7 +1570,7 @@ insert_to_freepool(poolp pool)
 
 #if WITH_PYMALLOC_RADIX_TREE
         /* mark arena region as not under control of obmalloc */
-        arena_map_mark_used(ao->address, 0);
+        arena_map_mark_used(state, ao->address, 0);
 #endif
 
         /* Free the entire arena. */
@@ -1544,7 +1657,7 @@ insert_to_freepool(poolp pool)
    Return 1 if it was freed.
    Return 0 if the block was not allocated by pymalloc_alloc(). */
 static inline int
-pymalloc_free(void *Py_UNUSED(ctx), void *p)
+pymalloc_free(OMState *state, void *Py_UNUSED(ctx), void *p)
 {
     assert(p != NULL);
 
@@ -1555,7 +1668,7 @@ pymalloc_free(void *Py_UNUSED(ctx), void *p)
 #endif
 
     poolp pool = POOL_ADDR(p);
-    if (UNLIKELY(!address_in_range(p, pool))) {
+    if (UNLIKELY(!address_in_range(state, p, pool))) {
         return 0;
     }
     /* We allocated this address. */
@@ -1579,7 +1692,7 @@ pymalloc_free(void *Py_UNUSED(ctx), void *p)
          * targets optimal filling when several pools contain
          * blocks of the same size class.
          */
-        insert_to_usedpool(pool);
+        insert_to_usedpool(state, pool);
         return 1;
     }
 
@@ -1596,7 +1709,7 @@ pymalloc_free(void *Py_UNUSED(ctx), void *p)
      * previously freed pools will be allocated later
      * (being not referenced, they are perhaps paged out).
      */
-    insert_to_freepool(pool);
+    insert_to_freepool(state, pool);
     return 1;
 }
 
@@ -1609,7 +1722,8 @@ _PyObject_Free(void *ctx, void *p)
         return;
     }
 
-    if (UNLIKELY(!pymalloc_free(ctx, p))) {
+    OMState *state = get_state();
+    if (UNLIKELY(!pymalloc_free(state, ctx, p))) {
         /* pymalloc didn't allocate this address */
         PyMem_RawFree(p);
         raw_allocated_blocks--;
@@ -1627,7 +1741,8 @@ _PyObject_Free(void *ctx, void *p)
 
    Return 0 if pymalloc didn't allocated p. */
 static int
-pymalloc_realloc(void *ctx, void **newptr_p, void *p, size_t nbytes)
+pymalloc_realloc(OMState *state, void *ctx,
+                 void **newptr_p, void *p, size_t nbytes)
 {
     void *bp;
     poolp pool;
@@ -1643,7 +1758,7 @@ pymalloc_realloc(void *ctx, void **newptr_p, void *p, size_t nbytes)
 #endif
 
     pool = POOL_ADDR(p);
-    if (!address_in_range(p, pool)) {
+    if (!address_in_range(state, p, pool)) {
         /* pymalloc is not managing this block.
 
            If nbytes <= SMALL_REQUEST_THRESHOLD, it's tempting to try to take
@@ -1696,7 +1811,8 @@ _PyObject_Realloc(void *ctx, void *ptr, size_t nbytes)
         return _PyObject_Malloc(ctx, nbytes);
     }
 
-    if (pymalloc_realloc(ctx, &ptr2, ptr, nbytes)) {
+    OMState *state = get_state();
+    if (pymalloc_realloc(state, ctx, &ptr2, ptr, nbytes)) {
         return ptr2;
     }
 
@@ -1710,11 +1826,29 @@ _PyObject_Realloc(void *ctx, void *ptr, size_t nbytes)
  * only be used by extensions that are compiled with pymalloc enabled. */
 
 Py_ssize_t
-_Py_GetAllocatedBlocks(void)
+_PyInterpreterState_GetAllocatedBlocks(PyInterpreterState *Py_UNUSED(interp))
+{
+    return 0;
+}
+
+Py_ssize_t
+_Py_GetGlobalAllocatedBlocks(void)
 {
     return 0;
 }
 
+void
+_PyInterpreterState_FinalizeAllocatedBlocks(PyInterpreterState *Py_UNUSED(interp))
+{
+    return;
+}
+
+void
+_Py_FinalizeAllocatedBlocks(_PyRuntimeState *Py_UNUSED(runtime))
+{
+    return;
+}
+
 #endif /* WITH_PYMALLOC */
 
 
@@ -2289,6 +2423,7 @@ _PyObject_DebugMallocStats(FILE *out)
     if (!_PyMem_PymallocEnabled()) {
         return 0;
     }
+    OMState *state = get_state();
 
     uint i;
     const uint numclasses = SMALL_REQUEST_THRESHOLD >> ALIGNMENT_SHIFT;
diff --git a/Python/pylifecycle.c b/Python/pylifecycle.c
index a510c9b22168bc..ebf1a0bff54eb0 100644
--- a/Python/pylifecycle.c
+++ b/Python/pylifecycle.c
@@ -547,11 +547,21 @@ pycore_init_runtime(_PyRuntimeState *runtime,
 }
 
 
-static void
+static PyStatus
 init_interp_settings(PyInterpreterState *interp, const _PyInterpreterConfig *config)
 {
     assert(interp->feature_flags == 0);
 
+    if (config->use_main_obmalloc) {
+        interp->feature_flags |= Py_RTFLAGS_USE_MAIN_OBMALLOC;
+    }
+    else if (!config->check_multi_interp_extensions) {
+        /* The reason: PyModuleDef.m_base.m_copy leaks objects between
+           interpreters. */
+        return _PyStatus_ERR("per-interpreter obmalloc does not support "
+                             "single-phase init extension modules");
+    }
+
     if (config->allow_fork) {
         interp->feature_flags |= Py_RTFLAGS_FORK;
     }
@@ -570,6 +580,8 @@ init_interp_settings(PyInterpreterState *interp, const _PyInterpreterConfig *con
     if (config->check_multi_interp_extensions) {
         interp->feature_flags |= Py_RTFLAGS_MULTI_INTERP_EXTENSIONS;
     }
+
+    return _PyStatus_OK();
 }
 
 
@@ -622,7 +634,10 @@ pycore_create_interpreter(_PyRuntimeState *runtime,
     }
 
     const _PyInterpreterConfig config = _PyInterpreterConfig_LEGACY_INIT;
-    init_interp_settings(interp, &config);
+    status = init_interp_settings(interp, &config);
+    if (_PyStatus_EXCEPTION(status)) {
+        return status;
+    }
 
     PyThreadState *tstate = _PyThreadState_New(interp);
     if (tstate == NULL) {
@@ -1668,6 +1683,8 @@ finalize_interp_types(PyInterpreterState *interp)
     _PyFloat_FiniType(interp);
     _PyLong_FiniTypes(interp);
     _PyThread_FiniType(interp);
+    // XXX fini collections module static types (_PyStaticType_Dealloc())
+    // XXX fini IO module static types (_PyStaticType_Dealloc())
     _PyErr_FiniTypes(interp);
     _PyTypes_FiniTypes(interp);
 
@@ -1936,6 +1953,7 @@ Py_FinalizeEx(void)
     }
     _Py_FinalizeRefTotal(runtime);
 #endif
+    _Py_FinalizeAllocatedBlocks(runtime);
 
 #ifdef Py_TRACE_REFS
     /* Display addresses (& refcnts) of all objects still alive.
@@ -2036,7 +2054,10 @@ new_interpreter(PyThreadState **tstate_p, const _PyInterpreterConfig *config)
         goto error;
     }
 
-    init_interp_settings(interp, config);
+    status = init_interp_settings(interp, config);
+    if (_PyStatus_EXCEPTION(status)) {
+        goto error;
+    }
 
     status = init_interp_create_gil(tstate);
     if (_PyStatus_EXCEPTION(status)) {
diff --git a/Python/pystate.c b/Python/pystate.c
index d108cfc7e50a0a..b2ef7e2dddeeba 100644
--- a/Python/pystate.c
+++ b/Python/pystate.c
@@ -671,6 +671,14 @@ init_interpreter(PyInterpreterState *interp,
     assert(next != NULL || (interp == runtime->interpreters.main));
     interp->next = next;
 
+    /* Initialize obmalloc, but only for subinterpreters,
+       since the main interpreter is initialized statically. */
+    if (interp != &runtime->_main_interpreter) {
+        poolp temp[OBMALLOC_USED_POOLS_SIZE] = \
+                _obmalloc_pools_INIT(interp->obmalloc.pools);
+        memcpy(&interp->obmalloc.pools.used, temp, sizeof(temp));
+    }
+
     _PyEval_InitState(&interp->ceval, pending_lock);
     _PyGC_InitState(&interp->gc);
     PyConfig_InitPythonConfig(&interp->config);
@@ -941,11 +949,12 @@ PyInterpreterState_Delete(PyInterpreterState *interp)
 
     _PyEval_FiniState(&interp->ceval);
 
-#ifdef Py_REF_DEBUG
-    // XXX This call should be done at the end of clear_interpreter(),
+    // XXX These two calls should be done at the end of clear_interpreter(),
     // but currently some objects get decref'ed after that.
+#ifdef Py_REF_DEBUG
     _PyInterpreterState_FinalizeRefTotal(interp);
 #endif
+    _PyInterpreterState_FinalizeAllocatedBlocks(interp);
 
     HEAD_LOCK(runtime);
     PyInterpreterState **p;
@@ -2320,11 +2329,11 @@ _PyCrossInterpreterData_InitWithSize(_PyCrossInterpreterData *data,
     // where it was allocated, so the interpreter is required.
     assert(interp != NULL);
     _PyCrossInterpreterData_Init(data, interp, NULL, obj, new_object);
-    data->data = PyMem_Malloc(size);
+    data->data = PyMem_RawMalloc(size);
     if (data->data == NULL) {
         return -1;
     }
-    data->free = PyMem_Free;
+    data->free = PyMem_RawFree;
     return 0;
 }
 
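
The PyMem_Malloc -> PyMem_RawMalloc change above is a consequence of per-interpreter obmalloc: cross-interpreter data can be freed by a different interpreter than the one that allocated it (or with no thread state active), so the buffer has to come from the process-wide raw allocator. A minimal sketch of the pairing rule this implies; the helper names are hypothetical, not part of the patch:

    #include <Python.h>

    /* Hypothetical helpers illustrating the allocator-pairing rule. */
    static void *
    alloc_shareable(size_t size)
    {
        /* The raw allocator is process-wide and does not depend on which
           interpreter (if any) is current. */
        return PyMem_RawMalloc(size);
    }

    static void
    free_shareable(void *data)
    {
        /* Must pair with PyMem_RawMalloc().  In the default configuration
           PyMem_Free() would route the pointer to the obmalloc state of
           whichever interpreter is current at free time. */
        PyMem_RawFree(data);
    }
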
diff --git a/Python/sysmodule.c b/Python/sysmodule.c
index 1e42e8dfceb5cc..58ed48859b5f3a 100644
--- a/Python/sysmodule.c
+++ b/Python/sysmodule.c
@@ -1871,7 +1871,9 @@ static Py_ssize_t
 sys_getallocatedblocks_impl(PyObject *module)
 /*[clinic end generated code: output=f0c4e873f0b6dcf7 input=dab13ee346a0673e]*/
 {
-    return _Py_GetAllocatedBlocks();
+    // It might make sense to return the count
+    // for just the current interpreter.
+    return _Py_GetGlobalAllocatedBlocks();
 }
 
 /*[clinic input]
diff --git a/Tools/c-analyzer/cpython/ignored.tsv b/Tools/c-analyzer/cpython/ignored.tsv
index a8ba88efc732fb..7a5d7d45f5184b 100644
--- a/Tools/c-analyzer/cpython/ignored.tsv
+++ b/Tools/c-analyzer/cpython/ignored.tsv
@@ -309,6 +309,7 @@ Objects/obmalloc.c	-	_PyMem	-
 Objects/obmalloc.c	-	_PyMem_Debug	-
 Objects/obmalloc.c	-	_PyMem_Raw	-
 Objects/obmalloc.c	-	_PyObject	-
+Objects/obmalloc.c	-	last_final_leaks	-
 Objects/obmalloc.c	-	usedpools	-
 Objects/typeobject.c	-	name_op	-
 Objects/typeobject.c	-	slotdefs	-