Skip to content

Commit f4480ee

Browse files
committed
zio: add separate pipeline stages for logical IO
The "logical" IO responsible for farming work out to the vdevs goes through the VDEV_IO stages, even though it does no IO itself, does not have a vdev set, and is not a "vdev" child IO. This means the VDEV_IO stages need special handling for this particular kind of IO, some of it totally irrelevant to real vdev IO (eg config lock, retry, etc). It also leads to some confusing asymmetries, eg the DVA throttle is held in the logical, and then released in pieces in the children. All this makes the code harder to read and understand, and hard to extend to limit behaviours to only logical or only vdev IO. This commit adds two new stages to the pipeline, ZIO_LOGICAL_IO_START and ZIO_LOGICAL_IO_DONE to handle this IO. This allows a clean separation between logical and vdev IO: vdev IO always has io_vd set and an io_child_type of ZIO_CHILD_VDEV, while logical IO is the inverse. Logical IO only ever goes through the LOGICAL_IO pipeline, and vdev IO through VDEV_IO. This separation presents a new problem, in that previously the logical IO would call into the mirror vdev ops to issue the vdev IO, which is now not possible because non-vdev IOs can't use vdev operations. To keep the overall pipeline tidy, we press the root vdev into service. zio_logical_io_start() creates a child IO against spa_root_vdev, which then delegates to the mirror vdev ops to do its work. Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. Signed-off-by: Rob Norris <[email protected]>
1 parent e0ef4d2 commit f4480ee

File tree

6 files changed

+310
-168
lines changed

6 files changed

+310
-168
lines changed

include/sys/zio_impl.h

Lines changed: 25 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@
2626

2727
/*
2828
* Copyright (c) 2012, 2015 by Delphix. All rights reserved.
29-
* Copyright (c) 2024, Klara Inc.
29+
* Copyright (c) 2024, 2025, Klara, Inc.
3030
*/
3131

3232
#ifndef _ZIO_IMPL_H
@@ -156,14 +156,17 @@ enum zio_stage {
156156

157157
ZIO_STAGE_READY = 1 << 20, /* RWFCXT */
158158

159-
ZIO_STAGE_VDEV_IO_START = 1 << 21, /* RW--XT */
160-
ZIO_STAGE_VDEV_IO_DONE = 1 << 22, /* RW--XT */
161-
ZIO_STAGE_VDEV_IO_ASSESS = 1 << 23, /* RW--XT */
159+
ZIO_STAGE_LOGICAL_IO_START = 1 << 21, /* RW---- */
160+
ZIO_STAGE_LOGICAL_IO_DONE = 1 << 22, /* RW---- */
162161

163-
ZIO_STAGE_CHECKSUM_VERIFY = 1 << 24, /* R----- */
164-
ZIO_STAGE_DIO_CHECKSUM_VERIFY = 1 << 25, /* -W---- */
162+
ZIO_STAGE_VDEV_IO_START = 1 << 23, /* RW--XT */
163+
ZIO_STAGE_VDEV_IO_DONE = 1 << 24, /* RW--XT */
164+
ZIO_STAGE_VDEV_IO_ASSESS = 1 << 25, /* RW--XT */
165165

166-
ZIO_STAGE_DONE = 1 << 26 /* RWFCXT */
166+
ZIO_STAGE_CHECKSUM_VERIFY = 1 << 26, /* R----- */
167+
ZIO_STAGE_DIO_CHECKSUM_VERIFY = 1 << 27, /* -W---- */
168+
169+
ZIO_STAGE_DONE = 1 << 28 /* RWFCXT */
167170
};
168171

169172
#define ZIO_ROOT_PIPELINE \
@@ -181,24 +184,30 @@ enum zio_stage {
181184
ZIO_STAGE_VDEV_IO_DONE | \
182185
ZIO_STAGE_VDEV_IO_ASSESS)
183186

187+
#define ZIO_LOGICAL_IO_STAGES \
188+
(ZIO_STAGE_LOGICAL_IO_START | \
189+
ZIO_STAGE_LOGICAL_IO_DONE)
190+
184191
#define ZIO_VDEV_CHILD_PIPELINE \
185192
(ZIO_VDEV_IO_STAGES | \
186193
ZIO_STAGE_DONE)
187194

188195
#define ZIO_READ_COMMON_STAGES \
189196
(ZIO_INTERLOCK_STAGES | \
190-
ZIO_VDEV_IO_STAGES | \
191197
ZIO_STAGE_CHECKSUM_VERIFY)
192198

193199
#define ZIO_READ_PHYS_PIPELINE \
194-
ZIO_READ_COMMON_STAGES
200+
(ZIO_READ_COMMON_STAGES | \
201+
ZIO_VDEV_IO_STAGES)
195202

196203
#define ZIO_READ_PIPELINE \
197204
(ZIO_READ_COMMON_STAGES | \
205+
ZIO_LOGICAL_IO_STAGES | \
198206
ZIO_STAGE_READ_BP_INIT)
199207

200208
#define ZIO_DDT_CHILD_READ_PIPELINE \
201-
ZIO_READ_COMMON_STAGES
209+
(ZIO_READ_COMMON_STAGES | \
210+
ZIO_LOGICAL_IO_STAGES)
202211

203212
#define ZIO_DDT_READ_PIPELINE \
204213
(ZIO_INTERLOCK_STAGES | \
@@ -208,21 +217,23 @@ enum zio_stage {
208217

209218
#define ZIO_WRITE_COMMON_STAGES \
210219
(ZIO_INTERLOCK_STAGES | \
211-
ZIO_VDEV_IO_STAGES | \
212220
ZIO_STAGE_ISSUE_ASYNC | \
213221
ZIO_STAGE_CHECKSUM_GENERATE)
214222

215223
#define ZIO_WRITE_PHYS_PIPELINE \
216-
ZIO_WRITE_COMMON_STAGES
224+
(ZIO_WRITE_COMMON_STAGES | \
225+
ZIO_VDEV_IO_STAGES)
217226

218227
#define ZIO_REWRITE_PIPELINE \
219228
(ZIO_WRITE_COMMON_STAGES | \
229+
ZIO_LOGICAL_IO_STAGES | \
220230
ZIO_STAGE_WRITE_COMPRESS | \
221231
ZIO_STAGE_ENCRYPT | \
222232
ZIO_STAGE_WRITE_BP_INIT)
223233

224234
#define ZIO_WRITE_PIPELINE \
225235
(ZIO_WRITE_COMMON_STAGES | \
236+
ZIO_LOGICAL_IO_STAGES | \
226237
ZIO_STAGE_WRITE_BP_INIT | \
227238
ZIO_STAGE_WRITE_COMPRESS | \
228239
ZIO_STAGE_ENCRYPT | \
@@ -235,7 +246,7 @@ enum zio_stage {
235246

236247
#define ZIO_DDT_CHILD_WRITE_PIPELINE \
237248
(ZIO_INTERLOCK_STAGES | \
238-
ZIO_VDEV_IO_STAGES | \
249+
ZIO_LOGICAL_IO_STAGES | \
239250
ZIO_STAGE_DVA_THROTTLE | \
240251
ZIO_STAGE_DVA_ALLOCATE)
241252

@@ -280,7 +291,7 @@ enum zio_stage {
280291
#define ZIO_BLOCKING_STAGES \
281292
(ZIO_STAGE_DVA_ALLOCATE | \
282293
ZIO_STAGE_DVA_CLAIM | \
283-
ZIO_STAGE_VDEV_IO_START)
294+
ZIO_STAGE_LOGICAL_IO_START)
284295

285296
extern void zio_inject_init(void);
286297
extern void zio_inject_fini(void);

man/man8/zpool-events.8

Lines changed: 10 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -426,16 +426,19 @@ ZIO_STAGE_DVA_ALLOCATE:0x00020000:-W----
426426
ZIO_STAGE_DVA_FREE:0x00040000:--F---
427427
ZIO_STAGE_DVA_CLAIM:0x00080000:---C--
428428

429-
ZIO_STAGE_READY:0x00100000:RWFCIT
429+
ZIO_STAGE_READY:0x00100000:RWFCXT
430430

431-
ZIO_STAGE_VDEV_IO_START:0x00200000:RW--XT
432-
ZIO_STAGE_VDEV_IO_DONE:0x00400000:RW--XT
433-
ZIO_STAGE_VDEV_IO_ASSESS:0x00800000:RW--XT
431+
ZIO_STAGE_LOGICAL_IO_START:0x00200000:RW----
432+
ZIO_STAGE_LOGICAL_IO_DONE:0x00400000:RW----
434433

435-
ZIO_STAGE_CHECKSUM_VERIFY:0x01000000:R-----
436-
ZIO_STAGE_DIO_CHECKSUM_VERIFY:0x02000000:-W----
434+
ZIO_STAGE_VDEV_IO_START:0x00800000:RW--XT
435+
ZIO_STAGE_VDEV_IO_DONE:0x01000000:RW--XT
436+
ZIO_STAGE_VDEV_IO_ASSESS:0x02000000:RW--XT
437437

438-
ZIO_STAGE_DONE:0x04000000:RWFCXT
438+
ZIO_STAGE_CHECKSUM_VERIFY:0x04000000:R-----
439+
ZIO_STAGE_DIO_CHECKSUM_VERIFY:0x08000000:-W----
440+
441+
ZIO_STAGE_DONE:0x10000000:RWFCXT
439442
.TE
440443
.
441444
.Sh I/O FLAGS

module/zcommon/zfs_valstr.c

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -251,6 +251,8 @@ _VALSTR_BITFIELD_IMPL(zio_stage,
251251
{ 'D', "DF", "DVA_FREE" },
252252
{ 'D', "DC", "DVA_CLAIM" },
253253
{ 'R', "R ", "READY" },
254+
{ 'L', "LS", "LOGICAL_IO_START" },
255+
{ 'L', "LD", "LOGICAL_IO_DONE" },
254256
{ 'V', "VS", "VDEV_IO_START" },
255257
{ 'V', "VD", "VDEV_IO_DONE" },
256258
{ 'V', "VA", "VDEV_IO_ASSESS" },

module/zfs/vdev_mirror.c

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626

2727
/*
2828
* Copyright (c) 2012, 2015 by Delphix. All rights reserved.
29+
* Copyright (c) 2025, Klara, Inc.
2930
*/
3031

3132
#include <sys/zfs_context.h>
@@ -270,7 +271,15 @@ vdev_mirror_map_init(zio_t *zio)
270271
vdev_t *vd = zio->io_vd;
271272
int c;
272273

273-
if (vd == NULL) {
274+
ASSERT3P(vd, !=, NULL);
275+
276+
if (vd == zio->io_spa->spa_root_vdev) {
277+
/*
278+
* Special case for "root" IO handling. We create a mirror map
279+
* that points to multiple locations within the same top-level
280+
* vdev, rather than the same location on multiple vdevs.
281+
*/
282+
274283
dva_t *dva = zio->io_bp->blk_dva;
275284
spa_t *spa = zio->io_spa;
276285
dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan;
@@ -746,7 +755,7 @@ vdev_mirror_io_done(zio_t *zio)
746755
/*
747756
* Always require at least one good copy.
748757
*
749-
* For ditto blocks (io_vd == NULL), require
758+
* For ditto blocks (root vdev), require
750759
* all copies to be good.
751760
*
752761
* XXX -- for replacing vdevs, there's no great answer.
@@ -757,7 +766,8 @@ vdev_mirror_io_done(zio_t *zio)
757766
* to be able to detach it -- which requires all
758767
* writes to the old device to have succeeded.
759768
*/
760-
if (good_copies == 0 || zio->io_vd == NULL)
769+
if (good_copies == 0 ||
770+
zio->io_vd == zio->io_spa->spa_root_vdev)
761771
zio->io_error = vdev_mirror_worst_error(mm);
762772
}
763773
return;

module/zfs/vdev_root.c

Lines changed: 23 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626

2727
/*
2828
* Copyright (c) 2012, 2016 by Delphix. All rights reserved.
29+
* Copyright (c) 2025, Klara, Inc.
2930
*/
3031

3132
#include <sys/zfs_context.h>
@@ -142,6 +143,26 @@ vdev_root_state_change(vdev_t *vd, int faulted, int degraded)
142143
}
143144
}
144145

146+
/*
147+
* "Root" IO just hands off to vdev_mirror, because handling multiple DVAs in
148+
* a single BP can be thought of as just another kind of mirror.
149+
*/
150+
static void
151+
vdev_root_io_start(zio_t *zio)
152+
{
153+
ASSERT3U(zio->io_error, ==, 0);
154+
ASSERT3U(zio->io_child_error[ZIO_CHILD_VDEV], ==, 0);
155+
ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE);
156+
157+
vdev_mirror_ops.vdev_op_io_start(zio);
158+
}
159+
160+
static void
161+
vdev_root_io_done(zio_t *zio)
162+
{
163+
vdev_mirror_ops.vdev_op_io_done(zio);
164+
}
165+
145166
vdev_ops_t vdev_root_ops = {
146167
.vdev_op_init = NULL,
147168
.vdev_op_fini = NULL,
@@ -151,8 +172,8 @@ vdev_ops_t vdev_root_ops = {
151172
.vdev_op_asize_to_psize = vdev_default_psize,
152173
.vdev_op_min_asize = vdev_default_min_asize,
153174
.vdev_op_min_alloc = NULL,
154-
.vdev_op_io_start = NULL, /* not applicable to the root */
155-
.vdev_op_io_done = NULL, /* not applicable to the root */
175+
.vdev_op_io_start = vdev_root_io_start,
176+
.vdev_op_io_done = vdev_root_io_done,
156177
.vdev_op_state_change = vdev_root_state_change,
157178
.vdev_op_need_resilver = NULL,
158179
.vdev_op_hold = NULL,

0 commit comments

Comments
 (0)