Skip to content

Commit e8aed00

Browse files
authored
Merge pull request #29774 from redpanda-data/ct/core-15645/l0-gc-reset
[CORE-15645] Cloud Topics: Level Zero GC force reset (escape hatch)
2 parents daca49a + 7266d4c commit e8aed00

11 files changed

Lines changed: 659 additions & 306 deletions

File tree

proto/redpanda/core/admin/internal/cloud_topics/v1/level_zero.proto

Lines changed: 32 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -25,22 +25,32 @@ option (pbgen.cpp_namespace) = "proto::admin::level_zero";
2525
// Admin endpoints to query Level-0 garbage collection workers for Cloud Topics.
2626

2727
service LevelZeroService {
28-
// Returns GC worker status for each shard on targeted node(s).
28+
// Returns GC worker status for each shard on specified node (if present) or
29+
// whole cluster.
2930
rpc GetStatus(GetStatusRequest) returns (GetStatusResponse) {
3031
option (pbgen.rpc) = {
3132
authz: SUPERUSER
3233
};
3334
}
3435

35-
// Starts GC workers on targeted node(s).
36-
rpc Start(StartRequest) returns (StartResponse) {
36+
// Starts GC workers on a specific node.
37+
rpc StartGc(StartGcRequest) returns (StartGcResponse) {
3738
option (pbgen.rpc) = {
3839
authz: SUPERUSER
3940
};
4041
}
4142

42-
// Pauses GC workers on targeted node(s).
43-
rpc Pause(PauseRequest) returns (PauseResponse) {
43+
// Pauses GC workers on a specific node.
44+
rpc PauseGc(PauseGcRequest) returns (PauseGcResponse) {
45+
option (pbgen.rpc) = {
46+
authz: SUPERUSER
47+
};
48+
}
49+
50+
// Resets GC worker internal state on the specified node.
51+
// Drains in-flight operations and clears iteration state.
52+
// GC resumes automatically if it was running before the reset.
53+
rpc ResetGc(ResetGcRequest) returns (ResetGcResponse) {
4454
option (pbgen.rpc) = {
4555
authz: SUPERUSER
4656
};
@@ -95,41 +105,32 @@ message ShardStatus {
95105
Status status = 2;
96106
}
97107

98-
// Request to start GC workers.
99-
message StartRequest {
100-
// Target a specific node, or omit to start on all nodes.
108+
// Request to start GC workers on a specific node.
109+
message StartGcRequest {
110+
// Target node. If omitted, the receiving node handles the request.
101111
optional int32 node_id = 1;
102112
}
103113

104-
// Aggregated start results from one or more nodes.
105-
message StartResponse {
106-
repeated StartResult results = 1;
107-
}
114+
// Empty on success; RPC error on failure.
115+
message StartGcResponse {}
108116

109-
// Request to pause GC workers.
110-
message PauseRequest {
111-
// Target a specific node, or omit to pause on all nodes.
117+
// Request to pause GC workers on a specific node.
118+
message PauseGcRequest {
119+
// Target node. If omitted, the receiving node handles the request.
112120
optional int32 node_id = 1;
113121
}
114122

115-
// Aggregated pause results from one or more nodes.
116-
message PauseResponse {
117-
repeated PauseResult results = 1;
118-
}
123+
// Empty on success; RPC error on failure.
124+
message PauseGcResponse {}
119125

120-
// Outcome of starting GC on a single node.
121-
message StartResult {
122-
int32 node_id = 1;
123-
// Set if the node could not be reached or returned an error.
124-
optional string error = 3;
126+
// Request to reset GC worker state on a specific node.
127+
message ResetGcRequest {
128+
// Target node. If omitted, the receiving node handles the request.
129+
optional int32 node_id = 1;
125130
}
126131

127-
// Outcome of pausing GC on a single node.
128-
message PauseResult {
129-
int32 node_id = 1;
130-
// Set if the node could not be reached or returned an error.
131-
optional string error = 3;
132-
}
132+
// Empty on success; RPC error on failure.
133+
message ResetGcResponse {}
133134

134135
// Request to advance a partition to the current cluster epoch.
135136
message AdvanceEpochRequest {
@@ -172,6 +173,7 @@ enum Status {
172173
L0_GC_STATUS_RUNNING = 2;
173174
L0_GC_STATUS_STOPPING = 3;
174175
L0_GC_STATUS_STOPPED = 4;
176+
L0_GC_STATUS_RESETTING = 5;
175177
}
176178

177179
// Request the L0 size estimate for a single partition.

src/v/cloud_topics/level_zero/gc/level_zero_gc.cc

Lines changed: 89 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -28,9 +28,19 @@
2828
#include <seastar/core/sleep.hh>
2929
#include <seastar/coroutine/as_future.hh>
3030

31+
#include <memory>
32+
33+
namespace {
34+
constexpr ss::lowres_clock::duration control_timeout = 5s;
35+
}
36+
3137
namespace cloud_topics {
3238

3339
class level_zero_gc::list_delete_worker {
40+
static constexpr auto handle_worker_exc = [](std::exception_ptr eptr) {
41+
vlog(cd_log.warn, "Exception from delete worker: {}", eptr);
42+
};
43+
3444
public:
3545
explicit list_delete_worker(
3646
std::unique_ptr<object_storage> storage,
@@ -39,9 +49,7 @@ class level_zero_gc::list_delete_worker {
3949
: storage_(std::move(storage))
4050
, node_info_(std::move(node_info))
4151
, probe_(&probe)
42-
, worker_([](std::exception_ptr eptr) {
43-
vlog(cd_log.warn, "Exception from delete worker: {}", eptr);
44-
}) {}
52+
, worker_(std::make_unique<ssx::work_queue>(handle_worker_exc)) {}
4553
void start() {
4654
vlog(cd_log.info, "Starting cloud topics list/delete worker");
4755
if (as_.abort_requested()) {
@@ -63,11 +71,40 @@ class level_zero_gc::list_delete_worker {
6371
as_.request_abort();
6472
delete_sem_.broken();
6573
page_sem_.broken();
66-
co_await worker_.shutdown();
74+
co_await worker_->shutdown();
6775
co_await gate_.close();
6876
vlog(cd_log.info, "Stopped cloud topics list/delete worker");
6977
}
7078

79+
seastar::future<> reset() {
80+
if (gate_.is_closed()) {
81+
co_return;
82+
}
83+
vlog(cd_log.info, "Resetting cloud topics list/delete worker");
84+
85+
// Abort in-flight list/delete operations
86+
as_.request_abort();
87+
88+
// Drain pending delete tasks
89+
co_await worker_->shutdown();
90+
91+
// Wait for spawned delete fibers to complete
92+
if (!gate_.is_closed()) {
93+
co_await gate_.close();
94+
}
95+
96+
continuation_token_.reset();
97+
curr_prefix_.reset();
98+
key_prefixes_.set_range(std::nullopt);
99+
100+
as_ = {};
101+
gate_ = {};
102+
103+
worker_ = std::make_unique<ssx::work_queue>(handle_worker_exc);
104+
105+
vlog(cd_log.info, "Reset cloud topics list/delete worker");
106+
}
107+
71108
bool has_capacity() const { return page_sem_.available_units() > 0; }
72109

73110
seastar::future<std::expected<
@@ -110,9 +147,9 @@ class level_zero_gc::list_delete_worker {
110147
// unbounded.
111148
u.emplace(seastar::consume_units(page_sem_, keys_total_bytes));
112149
}
113-
worker_.submit([this,
114-
o = std::move(objects),
115-
u = std::move(u).value()]() mutable {
150+
worker_->submit([this,
151+
o = std::move(objects),
152+
u = std::move(u).value()]() mutable {
116153
return do_delete_objects(std::move(o), std::move(u));
117154
});
118155
}
@@ -222,7 +259,7 @@ class level_zero_gc::list_delete_worker {
222259
std::unique_ptr<object_storage> storage_;
223260
std::unique_ptr<node_info> node_info_;
224261
level_zero_gc_probe* probe_;
225-
ssx::work_queue worker_;
262+
std::unique_ptr<ssx::work_queue> worker_;
226263
// TODO: configurable limits?
227264
// max number of in-flight delete ops
228265
ssx::semaphore delete_sem_{5, "ct/gc/delete"};
@@ -621,14 +658,22 @@ level_zero_gc::level_zero_gc(
621658

622659
level_zero_gc::~level_zero_gc() = default;
623660

624-
void level_zero_gc::start() {
661+
seastar::future<> level_zero_gc::start() {
662+
while (resetting_) {
663+
co_await reset_cv_.wait(
664+
control_timeout, [this] { return !resetting_; });
665+
}
625666
vlog(cd_log.info, "Starting cloud topics L0 GC worker");
626667
delete_worker_->start();
627668
should_run_ = true;
628669
worker_cv_.signal();
629670
}
630671

631-
void level_zero_gc::pause() {
672+
seastar::future<> level_zero_gc::pause() {
673+
while (resetting_) {
674+
co_await reset_cv_.wait(
675+
control_timeout, [this] { return !resetting_; });
676+
}
632677
vlog(cd_log.info, "Pausing cloud topics L0 GC worker");
633678
should_run_ = false;
634679
asrc_.request_abort();
@@ -645,13 +690,44 @@ seastar::future<> level_zero_gc::stop() {
645690
vlog(cd_log.info, "Stopped cloud_topics L0 GC worker");
646691
}
647692

693+
seastar::future<> level_zero_gc::reset() {
694+
if (should_shutdown_ || resetting_) {
695+
co_return;
696+
}
697+
vlog(cd_log.info, "Resetting cloud topics L0 GC worker state");
698+
699+
resetting_ = true;
700+
const bool was_running = should_run_;
701+
702+
auto done = ss::defer([this] {
703+
resetting_ = false;
704+
reset_cv_.broadcast();
705+
});
706+
707+
// Pause the outer worker loop so it blocks on the CV
708+
should_run_ = false;
709+
asrc_.request_abort();
710+
711+
co_await delete_worker_->reset();
712+
713+
// Resume if was running, then clear the flag so that start()/pause()
714+
// waiting on reset_cv_ don't race with the resume.
715+
if (was_running && !should_shutdown_) {
716+
delete_worker_->start();
717+
should_run_ = true;
718+
worker_cv_.signal();
719+
}
720+
}
721+
648722
std::string_view to_string_view(level_zero_gc::state s) {
649723
switch (s) {
650724
using enum level_zero_gc::state;
651725
case paused:
652726
return "level_zero_gc::state::paused";
653727
case running:
654728
return "level_zero_gc::state::running";
729+
case resetting:
730+
return "level_zero_gc::state::resetting";
655731
case stopping:
656732
return "level_zero_gc::state::stopping";
657733
case stopped:
@@ -667,6 +743,9 @@ auto level_zero_gc::get_state() const -> state {
667743
if (should_shutdown_) {
668744
return worker_.available() ? state::stopped : state::stopping;
669745
}
746+
if (resetting_) {
747+
return state::resetting;
748+
}
670749
return should_run_ ? state::running : state::paused;
671750
}();
672751
vlog(cd_log.debug, "cloud_topics L0 GC worker state: {}", st);

src/v/cloud_topics/level_zero/gc/level_zero_gc.h

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -281,9 +281,18 @@ class level_zero_gc {
281281
/*
282282
* Request that GC be started or paused. These can be called multiple times
283283
* and in any order. The last invocation will eventually take effect.
284+
*
285+
* If a reset is in progress, these will block until it completes or
286+
* control_timeout expires.
284287
*/
285-
void start();
286-
void pause();
288+
seastar::future<> start();
289+
seastar::future<> pause();
290+
291+
/// Reset internal GC state without a full stop/start cycle.
292+
/// Drains in-flight operations, clears pagination and prefix
293+
/// iteration state, and prepares for a fresh collection sweep.
294+
/// GC resumes automatically if it was running before the reset.
295+
seastar::future<> reset();
287296

288297
/*
289298
* Request and wait for GC to be completely stopped. After calling shutdown,
@@ -296,12 +305,14 @@ class level_zero_gc {
296305
*
297306
* - paused: Paused indefinitely, call start() to run
298307
* - running: GC will run until paused or stopped
308+
* - resetting: reset() is draining in-flight work
299309
* - stopping: stop() requested but there may be work still in flight
300310
* - stopped: Permanently stopped.
301311
*/
302312
enum class state : uint8_t {
303313
paused,
304314
running,
315+
resetting,
305316
stopping,
306317
stopped,
307318
};
@@ -317,9 +328,11 @@ class level_zero_gc {
317328

318329
bool should_run_;
319330
bool should_shutdown_;
331+
bool resetting_{false};
320332
seastar::abort_source asrc_;
321333
seastar::condition_variable worker_cv_;
322334
seastar::future<> worker_;
335+
seastar::condition_variable reset_cv_;
323336

324337
seastar::future<> worker();
325338
enum class collection_error : int8_t;

src/v/cloud_topics/level_zero/gc/tests/level_zero_gc_mt_test.cc

Lines changed: 35 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -210,12 +210,13 @@ struct level_zero_gc_mt_test : public seastar_test {
210210
}
211211

212212
// Add objects with various prefixes (call from shard 0 context)
213-
void populate_objects(size_t count) {
213+
void populate_objects(size_t count, bool dynamic_epoch = false) {
214214
g_bucket_state->objects.reserve(count);
215215
for (size_t i = 0; i < count; ++i) {
216216
auto prefix = static_cast<object_id::prefix_t>(i % 1000);
217217
auto id = object_id{
218-
.epoch = cluster_epoch(1),
218+
.epoch = cluster_epoch(
219+
dynamic_epoch ? static_cast<int64_t>(i) : 1),
219220
.name = uuid_t::create(),
220221
.prefix = prefix,
221222
};
@@ -307,6 +308,38 @@ TEST_F_CORO(level_zero_gc_mt_test, no_eligible_epoch) {
307308
EXPECT_EQ(get_total_deleted(), 0);
308309
}
309310

311+
/*
312+
* Concurrent reset/start/pause cycles don't crash or corrupt state.
313+
*/
314+
TEST_F_CORO(level_zero_gc_mt_test, concurrent_reset_start_pause) {
315+
populate_objects(num_objects, true /* dynamic_epoch */);
316+
set_max_epoch(num_objects / 2 - 1);
317+
318+
co_await gc_.invoke_on_all(&level_zero_gc::start);
319+
co_await ss::sleep(100ms);
320+
321+
std::vector<ss::future<>> futs;
322+
323+
for (int i = 0; i < 10; ++i) {
324+
futs.push_back(
325+
gc_.invoke_on_all([](level_zero_gc& gc) { return gc.reset(); }));
326+
futs.push_back(
327+
gc_.invoke_on_all([](level_zero_gc& gc) { return gc.reset(); }));
328+
futs.push_back(gc_.invoke_on_all(&level_zero_gc::start));
329+
futs.push_back(gc_.invoke_on_all(&level_zero_gc::pause));
330+
futs.push_back(gc_.invoke_on_all(&level_zero_gc::start));
331+
}
332+
333+
co_await ss::when_all_succeed(std::move(futs));
334+
335+
co_await gc_.invoke_on_all(&level_zero_gc::start);
336+
337+
set_max_epoch(num_objects);
338+
339+
RPTEST_REQUIRE_EVENTUALLY_CORO(
340+
5s, [this] { return get_total_deleted() == num_objects; });
341+
}
342+
310343
/*
311344
* Test start/pause/start cycle works correctly.
312345
*/

0 commit comments

Comments
 (0)