Skip to content

Commit 92dc4ad

Browse files
authored
Consider dnode_t allocations in dbuf cache size accounting
Entries in the dbuf cache contribute only the size of the dbuf data to the cache size. Attached "user" data is not counted. This can lead to the data currently "owned" by the cache consuming more memory than the accounting appears to show. In some cases (e.g. a metadnode data block with all child dnode_t slots allocated), the actual size can be as much as 3x what the cache believes it to be. This is arguably correct behaviour, as the cache is only tracking the size of the dbuf data, not even the overhead of the dbuf_t. On the other hand, in the above case of dnodes, evicting cached metadnode dbufs is the only current way to reclaim the dnode objects, and can lead to the situation where the dbuf cache appears to be comfortably within its target memory window and yet is holding enormous amounts of slab memory that cannot be reclaimed. This commit adds a facility for a dbuf user to artificially inflate the apparent size of the dbuf for caching purposes. This at least allows for cache tuning to be adjusted to match something closer to the real memory overhead. metadnode dbufs carry a >1KiB allocation per dnode in their user data. This informs the dbuf cache machinery of that fact, allowing it to make better decisions when evicting dbufs. Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. Reviewed-by: Alexander Motin <[email protected]> Reviewed-by: Brian Behlendorf <[email protected]> Signed-off-by: Rob Norris <[email protected]> Closes openzfs#15511
1 parent 6c6fae6 commit 92dc4ad

File tree

5 files changed

+102
-26
lines changed

5 files changed

+102
-26
lines changed

cmd/dbufstat.in

Lines changed: 11 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@ import re
3737

3838
bhdr = ["pool", "objset", "object", "level", "blkid", "offset", "dbsize"]
3939
bxhdr = ["pool", "objset", "object", "level", "blkid", "offset", "dbsize",
40-
"meta", "state", "dbholds", "dbc", "list", "atype", "flags",
40+
"usize", "meta", "state", "dbholds", "dbc", "list", "atype", "flags",
4141
"count", "asize", "access", "mru", "gmru", "mfu", "gmfu", "l2",
4242
"l2_dattr", "l2_asize", "l2_comp", "aholds", "dtype", "btype",
4343
"data_bs", "meta_bs", "bsize", "lvls", "dholds", "blocks", "dsize"]
@@ -47,17 +47,17 @@ dhdr = ["pool", "objset", "object", "dtype", "cached"]
4747
dxhdr = ["pool", "objset", "object", "dtype", "btype", "data_bs", "meta_bs",
4848
"bsize", "lvls", "dholds", "blocks", "dsize", "cached", "direct",
4949
"indirect", "bonus", "spill"]
50-
dincompat = ["level", "blkid", "offset", "dbsize", "meta", "state", "dbholds",
51-
"dbc", "list", "atype", "flags", "count", "asize", "access",
52-
"mru", "gmru", "mfu", "gmfu", "l2", "l2_dattr", "l2_asize",
53-
"l2_comp", "aholds"]
50+
dincompat = ["level", "blkid", "offset", "dbsize", "usize", "meta", "state",
51+
"dbholds", "dbc", "list", "atype", "flags", "count", "asize",
52+
"access", "mru", "gmru", "mfu", "gmfu", "l2", "l2_dattr",
53+
"l2_asize", "l2_comp", "aholds"]
5454

5555
thdr = ["pool", "objset", "dtype", "cached"]
5656
txhdr = ["pool", "objset", "dtype", "cached", "direct", "indirect",
5757
"bonus", "spill"]
58-
tincompat = ["object", "level", "blkid", "offset", "dbsize", "meta", "state",
59-
"dbc", "dbholds", "list", "atype", "flags", "count", "asize",
60-
"access", "mru", "gmru", "mfu", "gmfu", "l2", "l2_dattr",
58+
tincompat = ["object", "level", "blkid", "offset", "dbsize", "usize", "meta",
59+
"state", "dbc", "dbholds", "list", "atype", "flags", "count",
60+
"asize", "access", "mru", "gmru", "mfu", "gmfu", "l2", "l2_dattr",
6161
"l2_asize", "l2_comp", "aholds", "btype", "data_bs", "meta_bs",
6262
"bsize", "lvls", "dholds", "blocks", "dsize"]
6363

@@ -70,6 +70,7 @@ cols = {
7070
"blkid": [8, -1, "block number of buffer"],
7171
"offset": [12, 1024, "offset in object of buffer"],
7272
"dbsize": [7, 1024, "size of buffer"],
73+
"usize": [7, 1024, "size of attached user data"],
7374
"meta": [4, -1, "is this buffer metadata?"],
7475
"state": [5, -1, "state of buffer (read, cached, etc)"],
7576
"dbholds": [7, 1000, "number of holds on buffer"],
@@ -399,6 +400,7 @@ def update_dict(d, k, line, labels):
399400
key = line[labels[k]]
400401

401402
dbsize = int(line[labels['dbsize']])
403+
usize = int(line[labels['usize']])
402404
blkid = int(line[labels['blkid']])
403405
level = int(line[labels['level']])
404406

@@ -416,7 +418,7 @@ def update_dict(d, k, line, labels):
416418
d[pool][objset][key]['indirect'] = 0
417419
d[pool][objset][key]['spill'] = 0
418420

419-
d[pool][objset][key]['cached'] += dbsize
421+
d[pool][objset][key]['cached'] += dbsize + usize
420422

421423
if blkid == -1:
422424
d[pool][objset][key]['bonus'] += dbsize

include/sys/dmu.h

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -652,6 +652,9 @@ typedef struct dmu_buf_user {
652652
*/
653653
taskq_ent_t dbu_tqent;
654654

655+
/* Size of user data, for inclusion in dbuf_cache accounting. */
656+
uint64_t dbu_size;
657+
655658
/*
656659
* This instance's eviction function pointers.
657660
*
@@ -733,6 +736,16 @@ void *dmu_buf_replace_user(dmu_buf_t *db,
733736
*/
734737
void *dmu_buf_remove_user(dmu_buf_t *db, dmu_buf_user_t *user);
735738

739+
/*
740+
* User data size accounting. This can be used to artificially inflate the size
741+
* of the dbuf during cache accounting, so that dbuf_evict_thread evicts enough
742+
* to satisfy memory reclaim requests. It's not used for anything else, and
743+
* defaults to 0.
744+
*/
745+
uint64_t dmu_buf_user_size(dmu_buf_t *db);
746+
void dmu_buf_add_user_size(dmu_buf_t *db, uint64_t nadd);
747+
void dmu_buf_sub_user_size(dmu_buf_t *db, uint64_t nsub);
748+
736749
/*
737750
* Returns the user data (dmu_buf_user_t *) associated with this dbuf.
738751
*/

module/zfs/dbuf.c

Lines changed: 55 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -569,6 +569,21 @@ dbuf_evict_user(dmu_buf_impl_t *db)
569569
*dbu->dbu_clear_on_evict_dbufp = NULL;
570570
#endif
571571

572+
if (db->db_caching_status != DB_NO_CACHE) {
573+
/*
574+
* This is a cached dbuf, so the size of the user data is
575+
* included in its cached amount. We adjust it here because the
576+
* user data has already been detached from the dbuf, and the
577+
* sync functions are not supposed to touch it (the dbuf might
578+
* not exist anymore by the time the sync functions run).
579+
*/
580+
uint64_t size = dbu->dbu_size;
581+
(void) zfs_refcount_remove_many(
582+
&dbuf_caches[db->db_caching_status].size, size, db);
583+
if (db->db_caching_status == DB_DBUF_CACHE)
584+
DBUF_STAT_DECR(cache_levels_bytes[db->db_level], size);
585+
}
586+
572587
/*
573588
* There are two eviction callbacks - one that we call synchronously
574589
* and one that we invoke via a taskq. The async one is useful for
@@ -770,12 +785,12 @@ dbuf_evict_one(void)
770785
if (db != NULL) {
771786
multilist_sublist_remove(mls, db);
772787
multilist_sublist_unlock(mls);
788+
uint64_t size = db->db.db_size + dmu_buf_user_size(&db->db);
773789
(void) zfs_refcount_remove_many(
774-
&dbuf_caches[DB_DBUF_CACHE].size, db->db.db_size, db);
790+
&dbuf_caches[DB_DBUF_CACHE].size, size, db);
775791
DBUF_STAT_BUMPDOWN(cache_levels[db->db_level]);
776792
DBUF_STAT_BUMPDOWN(cache_count);
777-
DBUF_STAT_DECR(cache_levels_bytes[db->db_level],
778-
db->db.db_size);
793+
DBUF_STAT_DECR(cache_levels_bytes[db->db_level], size);
779794
ASSERT3U(db->db_caching_status, ==, DB_DBUF_CACHE);
780795
db->db_caching_status = DB_NO_CACHE;
781796
dbuf_destroy(db);
@@ -3002,6 +3017,8 @@ dbuf_destroy(dmu_buf_impl_t *db)
30023017
db->db_caching_status == DB_DBUF_METADATA_CACHE);
30033018

30043019
multilist_remove(&dbuf_caches[db->db_caching_status].cache, db);
3020+
3021+
ASSERT0(dmu_buf_user_size(&db->db));
30053022
(void) zfs_refcount_remove_many(
30063023
&dbuf_caches[db->db_caching_status].size,
30073024
db->db.db_size, db);
@@ -3749,17 +3766,17 @@ dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid,
37493766
db->db_caching_status == DB_DBUF_METADATA_CACHE);
37503767

37513768
multilist_remove(&dbuf_caches[db->db_caching_status].cache, db);
3769+
3770+
uint64_t size = db->db.db_size + dmu_buf_user_size(&db->db);
37523771
(void) zfs_refcount_remove_many(
3753-
&dbuf_caches[db->db_caching_status].size,
3754-
db->db.db_size, db);
3772+
&dbuf_caches[db->db_caching_status].size, size, db);
37553773

37563774
if (db->db_caching_status == DB_DBUF_METADATA_CACHE) {
37573775
DBUF_STAT_BUMPDOWN(metadata_cache_count);
37583776
} else {
37593777
DBUF_STAT_BUMPDOWN(cache_levels[db->db_level]);
37603778
DBUF_STAT_BUMPDOWN(cache_count);
3761-
DBUF_STAT_DECR(cache_levels_bytes[db->db_level],
3762-
db->db.db_size);
3779+
DBUF_STAT_DECR(cache_levels_bytes[db->db_level], size);
37633780
}
37643781
db->db_caching_status = DB_NO_CACHE;
37653782
}
@@ -3978,7 +3995,8 @@ dbuf_rele_and_unlock(dmu_buf_impl_t *db, const void *tag, boolean_t evicting)
39783995
db->db_caching_status = dcs;
39793996

39803997
multilist_insert(&dbuf_caches[dcs].cache, db);
3981-
uint64_t db_size = db->db.db_size;
3998+
uint64_t db_size = db->db.db_size +
3999+
dmu_buf_user_size(&db->db);
39824000
size = zfs_refcount_add_many(
39834001
&dbuf_caches[dcs].size, db_size, db);
39844002
uint8_t db_level = db->db_level;
@@ -4074,6 +4092,35 @@ dmu_buf_get_user(dmu_buf_t *db_fake)
40744092
return (db->db_user);
40754093
}
40764094

4095+
uint64_t
4096+
dmu_buf_user_size(dmu_buf_t *db_fake)
4097+
{
4098+
dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
4099+
if (db->db_user == NULL)
4100+
return (0);
4101+
return (atomic_load_64(&db->db_user->dbu_size));
4102+
}
4103+
4104+
void
4105+
dmu_buf_add_user_size(dmu_buf_t *db_fake, uint64_t nadd)
4106+
{
4107+
dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
4108+
ASSERT3U(db->db_caching_status, ==, DB_NO_CACHE);
4109+
ASSERT3P(db->db_user, !=, NULL);
4110+
ASSERT3U(atomic_load_64(&db->db_user->dbu_size), <, UINT64_MAX - nadd);
4111+
atomic_add_64(&db->db_user->dbu_size, nadd);
4112+
}
4113+
4114+
void
4115+
dmu_buf_sub_user_size(dmu_buf_t *db_fake, uint64_t nsub)
4116+
{
4117+
dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
4118+
ASSERT3U(db->db_caching_status, ==, DB_NO_CACHE);
4119+
ASSERT3P(db->db_user, !=, NULL);
4120+
ASSERT3U(atomic_load_64(&db->db_user->dbu_size), >=, nsub);
4121+
atomic_sub_64(&db->db_user->dbu_size, nsub);
4122+
}
4123+
40774124
void
40784125
dmu_buf_user_evict_wait(void)
40794126
{

module/zfs/dbuf_stats.c

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -46,14 +46,14 @@ static int
4646
dbuf_stats_hash_table_headers(char *buf, size_t size)
4747
{
4848
(void) snprintf(buf, size,
49-
"%-96s | %-119s | %s\n"
50-
"%-16s %-8s %-8s %-8s %-8s %-10s %-8s %-5s %-5s %-7s %3s | "
49+
"%-105s | %-119s | %s\n"
50+
"%-16s %-8s %-8s %-8s %-8s %-10s %-8s %-8s %-5s %-5s %-7s %3s | "
5151
"%-5s %-5s %-9s %-6s %-8s %-12s "
5252
"%-6s %-6s %-6s %-6s %-6s %-8s %-8s %-8s %-6s | "
5353
"%-6s %-6s %-8s %-8s %-6s %-6s %-6s %-8s %-8s\n",
5454
"dbuf", "arcbuf", "dnode", "pool", "objset", "object", "level",
55-
"blkid", "offset", "dbsize", "meta", "state", "dbholds", "dbc",
56-
"list", "atype", "flags", "count", "asize", "access",
55+
"blkid", "offset", "dbsize", "usize", "meta", "state", "dbholds",
56+
"dbc", "list", "atype", "flags", "count", "asize", "access",
5757
"mru", "gmru", "mfu", "gmfu", "l2", "l2_dattr", "l2_asize",
5858
"l2_comp", "aholds", "dtype", "btype", "data_bs", "meta_bs",
5959
"bsize", "lvls", "dholds", "blocks", "dsize");
@@ -75,8 +75,8 @@ __dbuf_stats_hash_table_data(char *buf, size_t size, dmu_buf_impl_t *db)
7575
__dmu_object_info_from_dnode(dn, &doi);
7676

7777
nwritten = snprintf(buf, size,
78-
"%-16s %-8llu %-8lld %-8lld %-8lld %-10llu %-8llu %-5d %-5d "
79-
"%-7lu %-3d | %-5d %-5d 0x%-7x %-6lu %-8llu %-12llu "
78+
"%-16s %-8llu %-8lld %-8lld %-8lld %-10llu %-8llu %-8llu "
79+
"%-5d %-5d %-7lu %-3d | %-5d %-5d 0x%-7x %-6lu %-8llu %-12llu "
8080
"%-6lu %-6lu %-6lu %-6lu %-6lu %-8llu %-8llu %-8d %-6lu | "
8181
"%-6d %-6d %-8lu %-8lu %-6llu %-6lu %-6lu %-8llu %-8llu\n",
8282
/* dmu_buf_impl_t */
@@ -87,6 +87,7 @@ __dbuf_stats_hash_table_data(char *buf, size_t size, dmu_buf_impl_t *db)
8787
(longlong_t)db->db_blkid,
8888
(u_longlong_t)db->db.db_offset,
8989
(u_longlong_t)db->db.db_size,
90+
(u_longlong_t)dmu_buf_user_size(&db->db),
9091
!!dbuf_is_metadata(db),
9192
db->db_state,
9293
(ulong_t)zfs_refcount_count(&db->db_holds),

module/zfs/dnode.c

Lines changed: 16 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1237,9 +1237,11 @@ dnode_check_slots_free(dnode_children_t *children, int idx, int slots)
12371237
return (B_TRUE);
12381238
}
12391239

1240-
static void
1240+
static uint_t
12411241
dnode_reclaim_slots(dnode_children_t *children, int idx, int slots)
12421242
{
1243+
uint_t reclaimed = 0;
1244+
12431245
ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);
12441246

12451247
for (int i = idx; i < idx + slots; i++) {
@@ -1251,8 +1253,11 @@ dnode_reclaim_slots(dnode_children_t *children, int idx, int slots)
12511253
ASSERT3S(dnh->dnh_dnode->dn_type, ==, DMU_OT_NONE);
12521254
dnode_destroy(dnh->dnh_dnode);
12531255
dnh->dnh_dnode = DN_SLOT_FREE;
1256+
reclaimed++;
12541257
}
12551258
}
1259+
1260+
return (reclaimed);
12561261
}
12571262

12581263
void
@@ -1565,6 +1570,8 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots,
15651570
} else {
15661571
dn = dnode_create(os, dn_block + idx, db,
15671572
object, dnh);
1573+
dmu_buf_add_user_size(&db->db,
1574+
sizeof (dnode_t));
15681575
}
15691576
}
15701577

@@ -1622,15 +1629,21 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots,
16221629
* to be freed. Single slot dnodes can be safely
16231630
* re-purposed as a performance optimization.
16241631
*/
1625-
if (slots > 1)
1626-
dnode_reclaim_slots(dnc, idx + 1, slots - 1);
1632+
if (slots > 1) {
1633+
uint_t reclaimed =
1634+
dnode_reclaim_slots(dnc, idx + 1, slots - 1);
1635+
if (reclaimed > 0)
1636+
dmu_buf_sub_user_size(&db->db,
1637+
reclaimed * sizeof (dnode_t));
1638+
}
16271639

16281640
dnh = &dnc->dnc_children[idx];
16291641
if (DN_SLOT_IS_PTR(dnh->dnh_dnode)) {
16301642
dn = dnh->dnh_dnode;
16311643
} else {
16321644
dn = dnode_create(os, dn_block + idx, db,
16331645
object, dnh);
1646+
dmu_buf_add_user_size(&db->db, sizeof (dnode_t));
16341647
}
16351648

16361649
mutex_enter(&dn->dn_mtx);

0 commit comments

Comments
 (0)