Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,7 @@ etcd-client = "0.12"
fst = "0.4.7"
futures = "0.3"
futures-util = "0.3"
greptime-proto = { git = "https://github.com/GreptimeTeam/greptime-proto.git", rev = "1bd2398b686e5ac6c1eef6daf615867ce27f75c1" }
greptime-proto = { git = "https://github.com/GreptimeTeam/greptime-proto.git", rev = "c00c73b76ed20a603e4d29fb96cb4e6ef987bba7" }
humantime = "2.1"
humantime-serde = "1.1"
itertools = "0.10"
Expand Down
6 changes: 6 additions & 0 deletions src/common/meta/src/rpc/procedure.rs
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,12 @@ pub fn procedure_state_to_pb_response(state: &ProcedureState) -> PbProcedureStat
ProcedureState::Done { .. } => (PbProcedureStatus::Done, String::default()),
ProcedureState::Retrying { error } => (PbProcedureStatus::Retrying, error.to_string()),
ProcedureState::Failed { error } => (PbProcedureStatus::Failed, error.to_string()),
ProcedureState::PrepareRollback { error } => {
(PbProcedureStatus::PrepareRollback, error.to_string())
}
ProcedureState::RollingBack { error } => {
(PbProcedureStatus::RollingBack, error.to_string())
}
};

PbProcedureStateResponse {
Expand Down
20 changes: 19 additions & 1 deletion src/common/procedure/src/error.rs
Original file line number Diff line number Diff line change
Expand Up @@ -104,12 +104,24 @@ pub enum Error {
location: Location,
},

#[snafu(display("Rollback Procedure recovered: {error}"))]
RollbackProcedureRecovered { error: String, location: Location },

#[snafu(display("Procedure retry exceeded max times, procedure_id: {}", procedure_id))]
RetryTimesExceeded {
source: Arc<Error>,
procedure_id: ProcedureId,
},

#[snafu(display(
"Procedure rollback exceeded max times, procedure_id: {}",
procedure_id
))]
RollbackTimesExceeded {
source: Arc<Error>,
procedure_id: ProcedureId,
},

#[snafu(display("Corrupted data, error: "))]
CorruptedData {
#[snafu(source)]
Expand Down Expand Up @@ -145,6 +157,9 @@ pub enum Error {

#[snafu(display("Unexpected: {err_msg}"))]
Unexpected { location: Location, err_msg: String },

#[snafu(display("Not support to rollback the procedure"))]
RollbackNotSupported { location: Location },
}

pub type Result<T> = std::result::Result<T, Error>;
Expand All @@ -161,9 +176,12 @@ impl ErrorExt for Error {
| Error::DeleteState { .. }
| Error::FromJson { .. }
| Error::RetryTimesExceeded { .. }
| Error::RollbackTimesExceeded { .. }
| Error::RetryLater { .. }
| Error::WaitWatcher { .. }
| Error::ManagerNotStart { .. } => StatusCode::Internal,
| Error::ManagerNotStart { .. }
| Error::RollbackProcedureRecovered { .. }
| Error::RollbackNotSupported { .. } => StatusCode::Internal,
Error::LoaderConflict { .. } | Error::DuplicateProcedure { .. } => {
StatusCode::InvalidArguments
}
Expand Down
117 changes: 87 additions & 30 deletions src/common/procedure/src/local.rs
Original file line number Diff line number Diff line change
Expand Up @@ -31,11 +31,11 @@ use tokio::sync::{Mutex as TokioMutex, Notify};

use self::rwlock::KeyRwLock;
use crate::error::{
DuplicateProcedureSnafu, Error, LoaderConflictSnafu, ManagerNotStartSnafu, Result,
self, DuplicateProcedureSnafu, Error, LoaderConflictSnafu, ManagerNotStartSnafu, Result,
StartRemoveOutdatedMetaTaskSnafu, StopRemoveOutdatedMetaTaskSnafu,
};
use crate::local::runner::Runner;
use crate::procedure::BoxedProcedureLoader;
use crate::procedure::{BoxedProcedureLoader, InitProcedureState};
use crate::store::{ProcedureMessage, ProcedureStore, StateStoreRef};
use crate::{
BoxedProcedure, ContextProvider, LockKey, ProcedureId, ProcedureManager, ProcedureState,
Expand Down Expand Up @@ -72,8 +72,13 @@ pub(crate) struct ProcedureMeta {
}

impl ProcedureMeta {
fn new(id: ProcedureId, parent_id: Option<ProcedureId>, lock_key: LockKey) -> ProcedureMeta {
let (state_sender, state_receiver) = watch::channel(ProcedureState::Running);
fn new(
id: ProcedureId,
procedure_state: ProcedureState,
parent_id: Option<ProcedureId>,
lock_key: LockKey,
) -> ProcedureMeta {
let (state_sender, state_receiver) = watch::channel(procedure_state);
ProcedureMeta {
id,
parent_id,
Expand Down Expand Up @@ -424,12 +429,18 @@ impl LocalManager {
fn submit_root(
&self,
procedure_id: ProcedureId,
procedure_state: ProcedureState,
step: u32,
procedure: BoxedProcedure,
) -> Result<Watcher> {
ensure!(self.manager_ctx.running(), ManagerNotStartSnafu);

let meta = Arc::new(ProcedureMeta::new(procedure_id, None, procedure.lock_key()));
let meta = Arc::new(ProcedureMeta::new(
procedure_id,
procedure_state,
None,
procedure.lock_key(),
));
let runner = Runner {
meta: meta.clone(),
procedure,
Expand Down Expand Up @@ -468,13 +479,11 @@ impl LocalManager {
Ok(watcher)
}

/// Recovers unfinished procedures and reruns them.
async fn recover(&self) -> Result<()> {
logging::info!("LocalManager start to recover");
let recover_start = Instant::now();

let (messages, finished_ids) = self.procedure_store.load_messages().await?;

fn submit_recovered_messages(
&self,
messages: HashMap<ProcedureId, ProcedureMessage>,
init_state: InitProcedureState,
) {
for (procedure_id, message) in &messages {
if message.parent_id.is_none() {
// This is the root procedure. We only submit the root procedure as it will
Expand All @@ -494,15 +503,40 @@ impl LocalManager {
loaded_procedure.step
);

let procedure_state = match init_state {
InitProcedureState::RollingBack => ProcedureState::RollingBack {
error: Arc::new(
error::RollbackProcedureRecoveredSnafu {
error: message.error.clone().unwrap_or("Unknown error".to_string()),
}
.build(),
),
},
InitProcedureState::Running => ProcedureState::Running,
};

if let Err(e) = self.submit_root(
*procedure_id,
procedure_state,
loaded_procedure.step,
loaded_procedure.procedure,
) {
logging::error!(e; "Failed to recover procedure {}", procedure_id);
}
}
}
}

/// Recovers unfinished procedures and reruns them.
Comment thread
WenyXu marked this conversation as resolved.
async fn recover(&self) -> Result<()> {
logging::info!("LocalManager start to recover");
let recover_start = Instant::now();

let (messages, rollback_messages, finished_ids) =
self.procedure_store.load_messages().await?;
// Submits recovered messages first.
self.submit_recovered_messages(rollback_messages, InitProcedureState::RollingBack);
self.submit_recovered_messages(messages, InitProcedureState::Running);

if !finished_ids.is_empty() {
logging::info!(
Expand Down Expand Up @@ -587,7 +621,12 @@ impl ProcedureManager for LocalManager {
DuplicateProcedureSnafu { procedure_id }
);

self.submit_root(procedure.id, 0, procedure.procedure)
self.submit_root(
procedure.id,
ProcedureState::Running,
0,
procedure.procedure,
)
}

async fn procedure_state(&self, procedure_id: ProcedureId) -> Result<Option<ProcedureState>> {
Expand Down Expand Up @@ -626,7 +665,12 @@ pub(crate) mod test_util {
use super::*;

pub(crate) fn procedure_meta_for_test() -> ProcedureMeta {
ProcedureMeta::new(ProcedureId::random(), None, LockKey::default())
ProcedureMeta::new(
ProcedureId::random(),
ProcedureState::Running,
None,
LockKey::default(),
)
}

pub(crate) fn new_object_store(dir: &TempDir) -> ObjectStore {
Expand Down Expand Up @@ -914,6 +958,14 @@ mod tests {
}
}

async fn rollback(&mut self, _: &Context) -> Result<()> {
Ok(())
}

fn rollback_supported(&self) -> bool {
true
}

fn dump(&self) -> Result<String> {
Ok(String::new())
}
Expand All @@ -923,24 +975,29 @@ mod tests {
}
}

let check_procedure = |procedure| {
async {
let procedure_id = ProcedureId::random();
let mut watcher = manager
.submit(ProcedureWithId {
id: procedure_id,
procedure: Box::new(procedure),
})
.await
.unwrap();
// Wait for the notification.
watcher.changed().await.unwrap();
assert!(watcher.borrow().is_failed());
}
let check_procedure = |procedure| async {
let procedure_id = ProcedureId::random();
manager
.submit(ProcedureWithId {
id: procedure_id,
procedure: Box::new(procedure),
})
.await
.unwrap()
};

check_procedure(MockProcedure { panic: false }).await;
check_procedure(MockProcedure { panic: true }).await;
let mut watcher = check_procedure(MockProcedure { panic: false }).await;
// Wait for the notification.
watcher.changed().await.unwrap();
assert!(watcher.borrow().is_prepare_rollback());
watcher.changed().await.unwrap();
assert!(watcher.borrow().is_rolling_back());
watcher.changed().await.unwrap();
assert!(watcher.borrow().is_failed());
// The runner won't rollback a panicked procedure.
let mut watcher = check_procedure(MockProcedure { panic: true }).await;
watcher.changed().await.unwrap();
assert!(watcher.borrow().is_failed());
}

#[tokio::test]
Expand Down
Loading