Skip to content

Commit 4fee43b

Browse files
amotin0mp
authored andcommitted
L2ARC: Relax locking during write
Previous code held ARC state sublist lock throughout all L2ARC write process, which included number of allocations and even ZIO issues. Being blocked in any of those places the code could also block ARC eviction, that could cause OOM activation or even dead- lock if system is low on memory or one is too fragmented. Fix it by dropping the lock as soon as we see a block eligible for L2ARC writing and pick it up later using earlier inserted marker. While there, also reduce scope of hash lock, moving ZIO allocation and other operations not requiring header access out of it. All operations requiring header access move under hash lock, since L2_WRITING flag does not prevent header eviction only transition to arc_l2c_only state with L1 header. To be able to manipulate sublist lock and marker as needed add few more multilist functions and modify one. Reviewed-by: Brian Behlendorf <[email protected]> Signed-off-by: Alexander Motin <[email protected]> Sponsored by: iXsystems, Inc. Closes openzfs#16040 (cherry picked from commit 997f85b) Signed-off-by: Mateusz Piotrowski <[email protected]>
1 parent bff2363 commit 4fee43b

File tree

6 files changed

+133
-104
lines changed

6 files changed

+133
-104
lines changed

include/sys/multilist.h

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -82,12 +82,15 @@ int multilist_is_empty(multilist_t *);
8282
unsigned int multilist_get_num_sublists(multilist_t *);
8383
unsigned int multilist_get_random_index(multilist_t *);
8484

85-
multilist_sublist_t *multilist_sublist_lock(multilist_t *, unsigned int);
85+
void multilist_sublist_lock(multilist_sublist_t *);
86+
multilist_sublist_t *multilist_sublist_lock_idx(multilist_t *, unsigned int);
8687
multilist_sublist_t *multilist_sublist_lock_obj(multilist_t *, void *);
8788
void multilist_sublist_unlock(multilist_sublist_t *);
8889

8990
void multilist_sublist_insert_head(multilist_sublist_t *, void *);
9091
void multilist_sublist_insert_tail(multilist_sublist_t *, void *);
92+
void multilist_sublist_insert_after(multilist_sublist_t *, void *, void *);
93+
void multilist_sublist_insert_before(multilist_sublist_t *, void *, void *);
9194
void multilist_sublist_move_forward(multilist_sublist_t *mls, void *obj);
9295
void multilist_sublist_remove(multilist_sublist_t *, void *);
9396
int multilist_sublist_is_empty(multilist_sublist_t *);

module/zfs/arc.c

Lines changed: 95 additions & 91 deletions
Original file line numberDiff line numberDiff line change
@@ -4065,7 +4065,7 @@ arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker,
40654065

40664066
ASSERT3P(marker, !=, NULL);
40674067

4068-
mls = multilist_sublist_lock(ml, idx);
4068+
mls = multilist_sublist_lock_idx(ml, idx);
40694069

40704070
for (hdr = multilist_sublist_prev(mls, marker); likely(hdr != NULL);
40714071
hdr = multilist_sublist_prev(mls, marker)) {
@@ -4178,6 +4178,26 @@ arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker,
41784178
return (bytes_evicted);
41794179
}
41804180

4181+
static arc_buf_hdr_t *
4182+
arc_state_alloc_marker(void)
4183+
{
4184+
arc_buf_hdr_t *marker = kmem_cache_alloc(hdr_full_cache, KM_SLEEP);
4185+
4186+
/*
4187+
* A b_spa of 0 is used to indicate that this header is
4188+
* a marker. This fact is used in arc_evict_state_impl().
4189+
*/
4190+
marker->b_spa = 0;
4191+
4192+
return (marker);
4193+
}
4194+
4195+
static void
4196+
arc_state_free_marker(arc_buf_hdr_t *marker)
4197+
{
4198+
kmem_cache_free(hdr_full_cache, marker);
4199+
}
4200+
41814201
/*
41824202
* Allocate an array of buffer headers used as placeholders during arc state
41834203
* eviction.
@@ -4188,25 +4208,16 @@ arc_state_alloc_markers(int count)
41884208
arc_buf_hdr_t **markers;
41894209

41904210
markers = kmem_zalloc(sizeof (*markers) * count, KM_SLEEP);
4191-
for (int i = 0; i < count; i++) {
4192-
markers[i] = kmem_cache_alloc(hdr_full_cache, KM_SLEEP);
4193-
4194-
/*
4195-
* A b_spa of 0 is used to indicate that this header is
4196-
* a marker. This fact is used in arc_evict_type() and
4197-
* arc_evict_state_impl().
4198-
*/
4199-
markers[i]->b_spa = 0;
4200-
4201-
}
4211+
for (int i = 0; i < count; i++)
4212+
markers[i] = arc_state_alloc_marker();
42024213
return (markers);
42034214
}
42044215

42054216
static void
42064217
arc_state_free_markers(arc_buf_hdr_t **markers, int count)
42074218
{
42084219
for (int i = 0; i < count; i++)
4209-
kmem_cache_free(hdr_full_cache, markers[i]);
4220+
arc_state_free_marker(markers[i]);
42104221
kmem_free(markers, sizeof (*markers) * count);
42114222
}
42124223

@@ -4250,7 +4261,7 @@ arc_evict_state(arc_state_t *state, uint64_t spa, uint64_t bytes,
42504261
for (int i = 0; i < num_sublists; i++) {
42514262
multilist_sublist_t *mls;
42524263

4253-
mls = multilist_sublist_lock(ml, i);
4264+
mls = multilist_sublist_lock_idx(ml, i);
42544265
multilist_sublist_insert_tail(mls, markers[i]);
42554266
multilist_sublist_unlock(mls);
42564267
}
@@ -4328,7 +4339,7 @@ arc_evict_state(arc_state_t *state, uint64_t spa, uint64_t bytes,
43284339
}
43294340

43304341
for (int i = 0; i < num_sublists; i++) {
4331-
multilist_sublist_t *mls = multilist_sublist_lock(ml, i);
4342+
multilist_sublist_t *mls = multilist_sublist_lock_idx(ml, i);
43324343
multilist_sublist_remove(mls, markers[i]);
43334344
multilist_sublist_unlock(mls);
43344345
}
@@ -4568,8 +4579,8 @@ arc_evict_type(arc_state_t *state)
45684579
* We keep the sublist lock until we're finished, to prevent
45694580
* the headers from being destroyed via arc_evict_state().
45704581
*/
4571-
data_mls = multilist_sublist_lock(data_ml, data_idx);
4572-
meta_mls = multilist_sublist_lock(meta_ml, meta_idx);
4582+
data_mls = multilist_sublist_lock_idx(data_ml, data_idx);
4583+
meta_mls = multilist_sublist_lock_idx(meta_ml, meta_idx);
45734584

45744585
/*
45754586
* These two loops are to ensure we skip any markers that
@@ -9139,7 +9150,7 @@ l2arc_sublist_lock(int list_num)
91399150
* sublists being selected.
91409151
*/
91419152
idx = multilist_get_random_index(ml);
9142-
return (multilist_sublist_lock(ml, idx));
9153+
return (multilist_sublist_lock_idx(ml, idx));
91439154
}
91449155

91459156
/*
@@ -9569,9 +9580,9 @@ l2arc_blk_fetch_done(zio_t *zio)
95699580
static uint64_t
95709581
l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
95719582
{
9572-
arc_buf_hdr_t *hdr, *hdr_prev, *head;
9573-
uint64_t write_asize, write_psize, write_lsize, headroom;
9574-
boolean_t full;
9583+
arc_buf_hdr_t *hdr, *head, *marker;
9584+
uint64_t write_asize, write_psize, headroom;
9585+
boolean_t full, from_head = !arc_warm;
95759586
l2arc_write_callback_t *cb = NULL;
95769587
zio_t *pio, *wzio;
95779588
uint64_t guid = spa_load_guid(spa);
@@ -9580,10 +9591,11 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
95809591
ASSERT3P(dev->l2ad_vdev, !=, NULL);
95819592

95829593
pio = NULL;
9583-
write_lsize = write_asize = write_psize = 0;
9594+
write_asize = write_psize = 0;
95849595
full = B_FALSE;
95859596
head = kmem_cache_alloc(hdr_l2only_cache, KM_PUSHPAGE);
95869597
arc_hdr_set_flags(head, ARC_FLAG_L2_WRITE_HEAD | ARC_FLAG_HAS_L2HDR);
9598+
marker = arc_state_alloc_marker();
95879599

95889600
/*
95899601
* Copy buffers for L2ARC writing.
@@ -9598,40 +9610,34 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
95989610
continue;
95999611
}
96009612

9601-
multilist_sublist_t *mls = l2arc_sublist_lock(pass);
96029613
uint64_t passed_sz = 0;
9603-
9604-
VERIFY3P(mls, !=, NULL);
9614+
headroom = target_sz * l2arc_headroom;
9615+
if (zfs_compressed_arc_enabled)
9616+
headroom = (headroom * l2arc_headroom_boost) / 100;
96059617

96069618
/*
9607-
* L2ARC fast warmup.
9608-
*
96099619
* Until the ARC is warm and starts to evict, read from the
96109620
* head of the ARC lists rather than the tail.
96119621
*/
9612-
if (arc_warm == B_FALSE)
9622+
multilist_sublist_t *mls = l2arc_sublist_lock(pass);
9623+
ASSERT3P(mls, !=, NULL);
9624+
if (from_head)
96139625
hdr = multilist_sublist_head(mls);
96149626
else
96159627
hdr = multilist_sublist_tail(mls);
96169628

9617-
headroom = target_sz * l2arc_headroom;
9618-
if (zfs_compressed_arc_enabled)
9619-
headroom = (headroom * l2arc_headroom_boost) / 100;
9620-
9621-
for (; hdr; hdr = hdr_prev) {
9629+
while (hdr != NULL) {
96229630
kmutex_t *hash_lock;
96239631
abd_t *to_write = NULL;
96249632

9625-
if (arc_warm == B_FALSE)
9626-
hdr_prev = multilist_sublist_next(mls, hdr);
9627-
else
9628-
hdr_prev = multilist_sublist_prev(mls, hdr);
9629-
96309633
hash_lock = HDR_LOCK(hdr);
96319634
if (!mutex_tryenter(hash_lock)) {
9632-
/*
9633-
* Skip this buffer rather than waiting.
9634-
*/
9635+
skip:
9636+
/* Skip this buffer rather than waiting. */
9637+
if (from_head)
9638+
hdr = multilist_sublist_next(mls, hdr);
9639+
else
9640+
hdr = multilist_sublist_prev(mls, hdr);
96359641
continue;
96369642
}
96379643

@@ -9646,7 +9652,7 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
96469652

96479653
if (!l2arc_write_eligible(guid, hdr)) {
96489654
mutex_exit(hash_lock);
9649-
continue;
9655+
goto skip;
96509656
}
96519657

96529658
/*
@@ -9656,7 +9662,6 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
96569662
* ARC_FLAG_L2_WRITING bit ensures this won't happen.
96579663
*/
96589664
ASSERT(HDR_HAS_L1HDR(hdr));
9659-
96609665
ASSERT3U(HDR_GET_PSIZE(hdr), >, 0);
96619666
ASSERT3U(arc_hdr_size(hdr), >, 0);
96629667
ASSERT(hdr->b_l1hdr.b_pabd != NULL ||
@@ -9672,18 +9677,18 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
96729677
}
96739678

96749679
/*
9675-
* We rely on the L1 portion of the header below, so
9676-
* it's invalid for this header to have been evicted out
9677-
* of the ghost cache, prior to being written out. The
9678-
* ARC_FLAG_L2_WRITING bit ensures this won't happen.
9680+
* We should not sleep with sublist lock held or it
9681+
* may block ARC eviction. Insert a marker to save
9682+
* the position and drop the lock.
96799683
*/
9680-
arc_hdr_set_flags(hdr, ARC_FLAG_L2_WRITING);
9681-
ASSERT(HDR_HAS_L1HDR(hdr));
9682-
9683-
ASSERT3U(HDR_GET_PSIZE(hdr), >, 0);
9684-
ASSERT(hdr->b_l1hdr.b_pabd != NULL ||
9685-
HDR_HAS_RABD(hdr));
9686-
ASSERT3U(arc_hdr_size(hdr), >, 0);
9684+
if (from_head) {
9685+
multilist_sublist_insert_after(mls, hdr,
9686+
marker);
9687+
} else {
9688+
multilist_sublist_insert_before(mls, hdr,
9689+
marker);
9690+
}
9691+
multilist_sublist_unlock(mls);
96879692

96889693
/*
96899694
* If this header has b_rabd, we can use this since it
@@ -9714,9 +9719,9 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
97149719
&to_write);
97159720
if (ret != 0) {
97169721
arc_hdr_clear_flags(hdr,
9717-
ARC_FLAG_L2_WRITING);
9722+
ARC_FLAG_L2CACHE);
97189723
mutex_exit(hash_lock);
9719-
continue;
9724+
goto next;
97209725
}
97219726

97229727
l2arc_free_abd_on_write(to_write, asize, type);
@@ -9725,73 +9730,70 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
97259730
/* l2arc_hdr_arcstats_update() expects a valid asize */
97269731
HDR_SET_L2SIZE(hdr, asize);
97279732

9733+
hdr->b_l2hdr.b_dev = dev;
9734+
hdr->b_l2hdr.b_daddr = dev->l2ad_hand;
9735+
hdr->b_l2hdr.b_hits = 0;
9736+
hdr->b_l2hdr.b_arcs_state =
9737+
hdr->b_l1hdr.b_state->arcs_state;
9738+
mutex_enter(&dev->l2ad_mtx);
97289739
if (pio == NULL) {
97299740
/*
97309741
* Insert a dummy header on the buflist so
97319742
* l2arc_write_done() can find where the
97329743
* write buffers begin without searching.
97339744
*/
9734-
mutex_enter(&dev->l2ad_mtx);
97359745
list_insert_head(&dev->l2ad_buflist, head);
9736-
mutex_exit(&dev->l2ad_mtx);
9746+
}
9747+
list_insert_head(&dev->l2ad_buflist, hdr);
9748+
mutex_exit(&dev->l2ad_mtx);
9749+
arc_hdr_set_flags(hdr, ARC_FLAG_HAS_L2HDR |
9750+
ARC_FLAG_L2_WRITING);
97379751

9752+
(void) zfs_refcount_add_many(&dev->l2ad_alloc,
9753+
arc_hdr_size(hdr), hdr);
9754+
l2arc_hdr_arcstats_increment(hdr);
9755+
9756+
boolean_t commit = l2arc_log_blk_insert(dev, hdr);
9757+
mutex_exit(hash_lock);
9758+
9759+
if (pio == NULL) {
97389760
cb = kmem_alloc(
97399761
sizeof (l2arc_write_callback_t), KM_SLEEP);
97409762
cb->l2wcb_dev = dev;
97419763
cb->l2wcb_head = head;
9742-
/*
9743-
* Create a list to save allocated abd buffers
9744-
* for l2arc_log_blk_commit().
9745-
*/
97469764
list_create(&cb->l2wcb_abd_list,
97479765
sizeof (l2arc_lb_abd_buf_t),
97489766
offsetof(l2arc_lb_abd_buf_t, node));
97499767
pio = zio_root(spa, l2arc_write_done, cb,
97509768
ZIO_FLAG_CANFAIL);
97519769
}
97529770

9753-
hdr->b_l2hdr.b_dev = dev;
9754-
hdr->b_l2hdr.b_hits = 0;
9755-
9756-
hdr->b_l2hdr.b_daddr = dev->l2ad_hand;
9757-
hdr->b_l2hdr.b_arcs_state =
9758-
hdr->b_l1hdr.b_state->arcs_state;
9759-
arc_hdr_set_flags(hdr, ARC_FLAG_HAS_L2HDR);
9760-
9761-
mutex_enter(&dev->l2ad_mtx);
9762-
list_insert_head(&dev->l2ad_buflist, hdr);
9763-
mutex_exit(&dev->l2ad_mtx);
9764-
9765-
(void) zfs_refcount_add_many(&dev->l2ad_alloc,
9766-
arc_hdr_size(hdr), hdr);
9767-
97689771
wzio = zio_write_phys(pio, dev->l2ad_vdev,
9769-
hdr->b_l2hdr.b_daddr, asize, to_write,
9772+
dev->l2ad_hand, asize, to_write,
97709773
ZIO_CHECKSUM_OFF, NULL, hdr,
97719774
ZIO_PRIORITY_ASYNC_WRITE,
97729775
ZIO_FLAG_CANFAIL, B_FALSE);
97739776

9774-
write_lsize += HDR_GET_LSIZE(hdr);
97759777
DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev,
97769778
zio_t *, wzio);
9779+
zio_nowait(wzio);
97779780

97789781
write_psize += psize;
97799782
write_asize += asize;
97809783
dev->l2ad_hand += asize;
9781-
l2arc_hdr_arcstats_increment(hdr);
97829784
vdev_space_update(dev->l2ad_vdev, asize, 0, 0);
97839785

9784-
mutex_exit(hash_lock);
9785-
9786-
/*
9787-
* Append buf info to current log and commit if full.
9788-
* arcstat_l2_{size,asize} kstats are updated
9789-
* internally.
9790-
*/
9791-
if (l2arc_log_blk_insert(dev, hdr))
9786+
if (commit) {
97929787
l2arc_log_blk_commit(dev, pio, cb);
9788+
}
97939789

9794-
zio_nowait(wzio);
9790+
next:
9791+
multilist_sublist_lock(mls);
9792+
if (from_head)
9793+
hdr = multilist_sublist_next(mls, marker);
9794+
else
9795+
hdr = multilist_sublist_prev(mls, marker);
9796+
multilist_sublist_remove(mls, marker);
97959797
}
97969798

97979799
multilist_sublist_unlock(mls);
@@ -9800,9 +9802,11 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
98009802
break;
98019803
}
98029804

9805+
arc_state_free_marker(marker);
9806+
98039807
/* No buffers selected for writing? */
98049808
if (pio == NULL) {
9805-
ASSERT0(write_lsize);
9809+
ASSERT0(write_psize);
98069810
ASSERT(!HDR_HAS_L1HDR(head));
98079811
kmem_cache_free(hdr_l2only_cache, head);
98089812

@@ -11229,7 +11233,7 @@ l2arc_log_blk_insert(l2arc_dev_t *dev, const arc_buf_hdr_t *hdr)
1122911233
L2BLK_SET_TYPE((le)->le_prop, hdr->b_type);
1123011234
L2BLK_SET_PROTECTED((le)->le_prop, !!(HDR_PROTECTED(hdr)));
1123111235
L2BLK_SET_PREFETCH((le)->le_prop, !!(HDR_PREFETCH(hdr)));
11232-
L2BLK_SET_STATE((le)->le_prop, hdr->b_l1hdr.b_state->arcs_state);
11236+
L2BLK_SET_STATE((le)->le_prop, hdr->b_l2hdr.b_arcs_state);
1123311237

1123411238
dev->l2ad_log_blk_payload_asize += vdev_psize_to_asize(dev->l2ad_vdev,
1123511239
HDR_GET_PSIZE(hdr));

module/zfs/dbuf.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -692,7 +692,7 @@ static void
692692
dbuf_evict_one(void)
693693
{
694694
int idx = multilist_get_random_index(&dbuf_caches[DB_DBUF_CACHE].cache);
695-
multilist_sublist_t *mls = multilist_sublist_lock(
695+
multilist_sublist_t *mls = multilist_sublist_lock_idx(
696696
&dbuf_caches[DB_DBUF_CACHE].cache, idx);
697697

698698
ASSERT(!MUTEX_HELD(&dbuf_evict_lock));

0 commit comments

Comments
 (0)