Skip to content

Consider dnode_t allocations in dbuf cache size accounting #15511

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Nov 17, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 11 additions & 9 deletions cmd/dbufstat.in
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ import re

bhdr = ["pool", "objset", "object", "level", "blkid", "offset", "dbsize"]
bxhdr = ["pool", "objset", "object", "level", "blkid", "offset", "dbsize",
"meta", "state", "dbholds", "dbc", "list", "atype", "flags",
"usize", "meta", "state", "dbholds", "dbc", "list", "atype", "flags",
"count", "asize", "access", "mru", "gmru", "mfu", "gmfu", "l2",
"l2_dattr", "l2_asize", "l2_comp", "aholds", "dtype", "btype",
"data_bs", "meta_bs", "bsize", "lvls", "dholds", "blocks", "dsize"]
Expand All @@ -47,17 +47,17 @@ dhdr = ["pool", "objset", "object", "dtype", "cached"]
dxhdr = ["pool", "objset", "object", "dtype", "btype", "data_bs", "meta_bs",
"bsize", "lvls", "dholds", "blocks", "dsize", "cached", "direct",
"indirect", "bonus", "spill"]
dincompat = ["level", "blkid", "offset", "dbsize", "meta", "state", "dbholds",
"dbc", "list", "atype", "flags", "count", "asize", "access",
"mru", "gmru", "mfu", "gmfu", "l2", "l2_dattr", "l2_asize",
"l2_comp", "aholds"]
dincompat = ["level", "blkid", "offset", "dbsize", "usize", "meta", "state",
"dbholds", "dbc", "list", "atype", "flags", "count", "asize",
"access", "mru", "gmru", "mfu", "gmfu", "l2", "l2_dattr",
"l2_asize", "l2_comp", "aholds"]

thdr = ["pool", "objset", "dtype", "cached"]
txhdr = ["pool", "objset", "dtype", "cached", "direct", "indirect",
"bonus", "spill"]
tincompat = ["object", "level", "blkid", "offset", "dbsize", "meta", "state",
"dbc", "dbholds", "list", "atype", "flags", "count", "asize",
"access", "mru", "gmru", "mfu", "gmfu", "l2", "l2_dattr",
tincompat = ["object", "level", "blkid", "offset", "dbsize", "usize", "meta",
"state", "dbc", "dbholds", "list", "atype", "flags", "count",
"asize", "access", "mru", "gmru", "mfu", "gmfu", "l2", "l2_dattr",
"l2_asize", "l2_comp", "aholds", "btype", "data_bs", "meta_bs",
"bsize", "lvls", "dholds", "blocks", "dsize"]

Expand All @@ -70,6 +70,7 @@ cols = {
"blkid": [8, -1, "block number of buffer"],
"offset": [12, 1024, "offset in object of buffer"],
"dbsize": [7, 1024, "size of buffer"],
"usize": [7, 1024, "size of attached user data"],
"meta": [4, -1, "is this buffer metadata?"],
"state": [5, -1, "state of buffer (read, cached, etc)"],
"dbholds": [7, 1000, "number of holds on buffer"],
Expand Down Expand Up @@ -399,6 +400,7 @@ def update_dict(d, k, line, labels):
key = line[labels[k]]

dbsize = int(line[labels['dbsize']])
usize = int(line[labels['usize']])
blkid = int(line[labels['blkid']])
level = int(line[labels['level']])

Expand All @@ -416,7 +418,7 @@ def update_dict(d, k, line, labels):
d[pool][objset][key]['indirect'] = 0
d[pool][objset][key]['spill'] = 0

d[pool][objset][key]['cached'] += dbsize
d[pool][objset][key]['cached'] += dbsize + usize

if blkid == -1:
d[pool][objset][key]['bonus'] += dbsize
Expand Down
13 changes: 13 additions & 0 deletions include/sys/dmu.h
Original file line number Diff line number Diff line change
Expand Up @@ -652,6 +652,9 @@ typedef struct dmu_buf_user {
*/
taskq_ent_t dbu_tqent;

/* Size of user data, for inclusion in dbuf_cache accounting. */
uint64_t dbu_size;

/*
* This instance's eviction function pointers.
*
Expand Down Expand Up @@ -733,6 +736,16 @@ void *dmu_buf_replace_user(dmu_buf_t *db,
*/
void *dmu_buf_remove_user(dmu_buf_t *db, dmu_buf_user_t *user);

/*
 * User data size accounting. This can be used to artificially inflate the size
 * of the dbuf during cache accounting, so that dbuf_evict_thread evicts enough
 * to satisfy memory reclaim requests. It's not used for anything else, and
 * defaults to 0.
 */
/* Returns the recorded user data size, or 0 if the dbuf has no user. */
uint64_t dmu_buf_user_size(dmu_buf_t *db);
/* Adds nadd bytes; asserts the dbuf has a user and is not in a dbuf cache. */
void dmu_buf_add_user_size(dmu_buf_t *db, uint64_t nadd);
/* Subtracts nsub bytes; asserts as above and that the size is >= nsub. */
void dmu_buf_sub_user_size(dmu_buf_t *db, uint64_t nsub);

/*
* Returns the user data (dmu_buf_user_t *) associated with this dbuf.
*/
Expand Down
63 changes: 55 additions & 8 deletions module/zfs/dbuf.c
Original file line number Diff line number Diff line change
Expand Up @@ -569,6 +569,21 @@ dbuf_evict_user(dmu_buf_impl_t *db)
*dbu->dbu_clear_on_evict_dbufp = NULL;
#endif

if (db->db_caching_status != DB_NO_CACHE) {
/*
* This is a cached dbuf, so the size of the user data is
* included in its cached amount. We adjust it here because the
* user data has already been detached from the dbuf, and the
* sync functions are not supposed to touch it (the dbuf might
* not exist anymore by the time the sync functions run).
*/
uint64_t size = dbu->dbu_size;
(void) zfs_refcount_remove_many(
&dbuf_caches[db->db_caching_status].size, size, db);
if (db->db_caching_status == DB_DBUF_CACHE)
DBUF_STAT_DECR(cache_levels_bytes[db->db_level], size);
}

/*
* There are two eviction callbacks - one that we call synchronously
* and one that we invoke via a taskq. The async one is useful for
Expand Down Expand Up @@ -770,12 +785,12 @@ dbuf_evict_one(void)
if (db != NULL) {
multilist_sublist_remove(mls, db);
multilist_sublist_unlock(mls);
uint64_t size = db->db.db_size + dmu_buf_user_size(&db->db);
(void) zfs_refcount_remove_many(
&dbuf_caches[DB_DBUF_CACHE].size, db->db.db_size, db);
&dbuf_caches[DB_DBUF_CACHE].size, size, db);
DBUF_STAT_BUMPDOWN(cache_levels[db->db_level]);
DBUF_STAT_BUMPDOWN(cache_count);
DBUF_STAT_DECR(cache_levels_bytes[db->db_level],
db->db.db_size);
DBUF_STAT_DECR(cache_levels_bytes[db->db_level], size);
ASSERT3U(db->db_caching_status, ==, DB_DBUF_CACHE);
db->db_caching_status = DB_NO_CACHE;
dbuf_destroy(db);
Expand Down Expand Up @@ -3002,6 +3017,8 @@ dbuf_destroy(dmu_buf_impl_t *db)
db->db_caching_status == DB_DBUF_METADATA_CACHE);

multilist_remove(&dbuf_caches[db->db_caching_status].cache, db);

ASSERT0(dmu_buf_user_size(&db->db));
(void) zfs_refcount_remove_many(
&dbuf_caches[db->db_caching_status].size,
db->db.db_size, db);
Expand Down Expand Up @@ -3749,17 +3766,17 @@ dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid,
db->db_caching_status == DB_DBUF_METADATA_CACHE);

multilist_remove(&dbuf_caches[db->db_caching_status].cache, db);

uint64_t size = db->db.db_size + dmu_buf_user_size(&db->db);
(void) zfs_refcount_remove_many(
&dbuf_caches[db->db_caching_status].size,
db->db.db_size, db);
&dbuf_caches[db->db_caching_status].size, size, db);

if (db->db_caching_status == DB_DBUF_METADATA_CACHE) {
DBUF_STAT_BUMPDOWN(metadata_cache_count);
} else {
DBUF_STAT_BUMPDOWN(cache_levels[db->db_level]);
DBUF_STAT_BUMPDOWN(cache_count);
DBUF_STAT_DECR(cache_levels_bytes[db->db_level],
db->db.db_size);
DBUF_STAT_DECR(cache_levels_bytes[db->db_level], size);
}
db->db_caching_status = DB_NO_CACHE;
}
Expand Down Expand Up @@ -3978,7 +3995,8 @@ dbuf_rele_and_unlock(dmu_buf_impl_t *db, const void *tag, boolean_t evicting)
db->db_caching_status = dcs;

multilist_insert(&dbuf_caches[dcs].cache, db);
uint64_t db_size = db->db.db_size;
uint64_t db_size = db->db.db_size +
dmu_buf_user_size(&db->db);
size = zfs_refcount_add_many(
&dbuf_caches[dcs].size, db_size, db);
uint8_t db_level = db->db_level;
Expand Down Expand Up @@ -4074,6 +4092,35 @@ dmu_buf_get_user(dmu_buf_t *db_fake)
return (db->db_user);
}

/*
 * Report the user data size recorded for this dbuf. A dbuf with no user
 * attached contributes nothing, so 0 is returned in that case.
 */
uint64_t
dmu_buf_user_size(dmu_buf_t *db_fake)
{
	dmu_buf_impl_t *dbuf = (dmu_buf_impl_t *)db_fake;

	return (dbuf->db_user != NULL ?
	    atomic_load_64(&dbuf->db_user->dbu_size) : 0);
}

/*
 * Grow the user data size recorded for this dbuf by nadd bytes.
 *
 * The dbuf must have a user attached and must not currently be in a dbuf
 * cache (db_caching_status == DB_NO_CACHE); both are asserted, as is that
 * the addition does not overflow the 64-bit counter.
 */
void
dmu_buf_add_user_size(dmu_buf_t *db_fake, uint64_t nadd)
{
	dmu_buf_impl_t *dbuf = (dmu_buf_impl_t *)db_fake;

	ASSERT3U(dbuf->db_caching_status, ==, DB_NO_CACHE);
	ASSERT3P(dbuf->db_user, !=, NULL);

	/* Only take the field's address once the user is known to exist. */
	uint64_t *sizep = &dbuf->db_user->dbu_size;

	ASSERT3U(atomic_load_64(sizep), <, UINT64_MAX - nadd);
	atomic_add_64(sizep, nadd);
}

/*
 * Shrink the user data size recorded for this dbuf by nsub bytes.
 *
 * The dbuf must have a user attached and must not currently be in a dbuf
 * cache (db_caching_status == DB_NO_CACHE); both are asserted, as is that
 * the recorded size is at least nsub so the counter cannot underflow.
 */
void
dmu_buf_sub_user_size(dmu_buf_t *db_fake, uint64_t nsub)
{
	dmu_buf_impl_t *dbuf = (dmu_buf_impl_t *)db_fake;

	ASSERT3U(dbuf->db_caching_status, ==, DB_NO_CACHE);
	ASSERT3P(dbuf->db_user, !=, NULL);

	/* Only take the field's address once the user is known to exist. */
	uint64_t *sizep = &dbuf->db_user->dbu_size;

	ASSERT3U(atomic_load_64(sizep), >=, nsub);
	atomic_sub_64(sizep, nsub);
}

void
dmu_buf_user_evict_wait(void)
{
Expand Down
13 changes: 7 additions & 6 deletions module/zfs/dbuf_stats.c
Original file line number Diff line number Diff line change
Expand Up @@ -46,14 +46,14 @@ static int
dbuf_stats_hash_table_headers(char *buf, size_t size)
{
(void) snprintf(buf, size,
"%-96s | %-119s | %s\n"
"%-16s %-8s %-8s %-8s %-8s %-10s %-8s %-5s %-5s %-7s %3s | "
"%-105s | %-119s | %s\n"
"%-16s %-8s %-8s %-8s %-8s %-10s %-8s %-8s %-5s %-5s %-7s %3s | "
"%-5s %-5s %-9s %-6s %-8s %-12s "
"%-6s %-6s %-6s %-6s %-6s %-8s %-8s %-8s %-6s | "
"%-6s %-6s %-8s %-8s %-6s %-6s %-6s %-8s %-8s\n",
"dbuf", "arcbuf", "dnode", "pool", "objset", "object", "level",
"blkid", "offset", "dbsize", "meta", "state", "dbholds", "dbc",
"list", "atype", "flags", "count", "asize", "access",
"blkid", "offset", "dbsize", "usize", "meta", "state", "dbholds",
"dbc", "list", "atype", "flags", "count", "asize", "access",
"mru", "gmru", "mfu", "gmfu", "l2", "l2_dattr", "l2_asize",
"l2_comp", "aholds", "dtype", "btype", "data_bs", "meta_bs",
"bsize", "lvls", "dholds", "blocks", "dsize");
Expand All @@ -75,8 +75,8 @@ __dbuf_stats_hash_table_data(char *buf, size_t size, dmu_buf_impl_t *db)
__dmu_object_info_from_dnode(dn, &doi);

nwritten = snprintf(buf, size,
"%-16s %-8llu %-8lld %-8lld %-8lld %-10llu %-8llu %-5d %-5d "
"%-7lu %-3d | %-5d %-5d 0x%-7x %-6lu %-8llu %-12llu "
"%-16s %-8llu %-8lld %-8lld %-8lld %-10llu %-8llu %-8llu "
"%-5d %-5d %-7lu %-3d | %-5d %-5d 0x%-7x %-6lu %-8llu %-12llu "
"%-6lu %-6lu %-6lu %-6lu %-6lu %-8llu %-8llu %-8d %-6lu | "
"%-6d %-6d %-8lu %-8lu %-6llu %-6lu %-6lu %-8llu %-8llu\n",
/* dmu_buf_impl_t */
Expand All @@ -87,6 +87,7 @@ __dbuf_stats_hash_table_data(char *buf, size_t size, dmu_buf_impl_t *db)
(longlong_t)db->db_blkid,
(u_longlong_t)db->db.db_offset,
(u_longlong_t)db->db.db_size,
(u_longlong_t)dmu_buf_user_size(&db->db),
!!dbuf_is_metadata(db),
db->db_state,
(ulong_t)zfs_refcount_count(&db->db_holds),
Expand Down
19 changes: 16 additions & 3 deletions module/zfs/dnode.c
Original file line number Diff line number Diff line change
Expand Up @@ -1237,9 +1237,11 @@ dnode_check_slots_free(dnode_children_t *children, int idx, int slots)
return (B_TRUE);
}

static void
static uint_t
dnode_reclaim_slots(dnode_children_t *children, int idx, int slots)
{
uint_t reclaimed = 0;

ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);

for (int i = idx; i < idx + slots; i++) {
Expand All @@ -1251,8 +1253,11 @@ dnode_reclaim_slots(dnode_children_t *children, int idx, int slots)
ASSERT3S(dnh->dnh_dnode->dn_type, ==, DMU_OT_NONE);
dnode_destroy(dnh->dnh_dnode);
dnh->dnh_dnode = DN_SLOT_FREE;
reclaimed++;
}
}

return (reclaimed);
}

void
Expand Down Expand Up @@ -1565,6 +1570,8 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots,
} else {
dn = dnode_create(os, dn_block + idx, db,
object, dnh);
dmu_buf_add_user_size(&db->db,
sizeof (dnode_t));
}
}

Expand Down Expand Up @@ -1622,15 +1629,21 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots,
* to be freed. Single slot dnodes can be safely
* re-purposed as a performance optimization.
*/
if (slots > 1)
dnode_reclaim_slots(dnc, idx + 1, slots - 1);
if (slots > 1) {
uint_t reclaimed =
dnode_reclaim_slots(dnc, idx + 1, slots - 1);
if (reclaimed > 0)
dmu_buf_sub_user_size(&db->db,
reclaimed * sizeof (dnode_t));
}

dnh = &dnc->dnc_children[idx];
if (DN_SLOT_IS_PTR(dnh->dnh_dnode)) {
dn = dnh->dnh_dnode;
} else {
dn = dnode_create(os, dn_block + idx, db,
object, dnh);
dmu_buf_add_user_size(&db->db, sizeof (dnode_t));
}

mutex_enter(&dn->dn_mtx);
Expand Down