Skip to content

Commit f0de9b6

Browse files
backend/local: Periodically persist intermediate state snapshots
Terraform Core emits a hook event every time it writes a change into the in-memory state. Previously the local backend would just copy that into the transient storage of the state manager, but for most state storage implementations that doesn't really do anything useful because it just makes another copy of the state in memory. We originally added this hook mechanism with the intent of making Terraform _persist_ the state each time, but we backed that out after finding that it was a bit too aggressive and was making the state snapshot history much harder to use in storage systems that can preserve historical snapshots. However, sometimes Terraform gets killed mid-apply for whatever reason and in our previous implementation that meant always losing that transient state, forcing the user to edit the state manually (or use "import") to recover a useful state. In an attempt at finding a sweet spot between these extremes, here we change the rule so that if an apply runs for longer than 20 seconds then we'll try to persist the state to the backend in an update that arrives at least 20 seconds after the first update, and then again for each additional 20 second period as long as Terraform keeps announcing new state snapshots. This also introduces a special interruption mode where if the apply phase gets interrupted by SIGINT (or equivalent) then the local backend will try to persist the state immediately in anticipation of a possibly-imminent SIGKILL, and will then immediately persist any subsequent state update that arrives until the apply phase is complete. After interruption Terraform will not start any new operations and will instead just let any already-running operations run to completion, and so this will persist the state once per resource instance that is able to complete before being killed. This does mean that now long-running applies will generate intermediate state snapshots where they wouldn't before, but there should still be considerably fewer snapshots than were created when we were persisting for each individual state change. We can adjust the 20 second interval in future commits if we find that this spot isn't as sweet as first assumed.
1 parent 47fd019 commit f0de9b6

File tree

10 files changed

+401
-0
lines changed

10 files changed

+401
-0
lines changed

internal/backend/local/backend_apply.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ import (
55
"errors"
66
"fmt"
77
"log"
8+
"time"
89

910
"github.com/hashicorp/terraform/internal/backend"
1011
"github.com/hashicorp/terraform/internal/command/views"
@@ -74,6 +75,10 @@ func (b *Local) opApply(
7475
op.ReportResult(runningOp, diags)
7576
return
7677
}
78+
// stateHook uses schemas for when it periodically persists state to the
79+
// persistent storage backend.
80+
stateHook.Schemas = schemas
81+
stateHook.PersistInterval = 20 * time.Second // arbitrary interval that's hopefully a sweet spot
7782

7883
var plan *plans.Plan
7984
// If we weren't given a plan, then we refresh/plan

internal/backend/local/hook_state.go

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,9 @@
11
package local
22

33
import (
4+
"log"
45
"sync"
6+
"time"
57

68
"github.com/hashicorp/terraform/internal/states"
79
"github.com/hashicorp/terraform/internal/states/statemgr"
@@ -15,6 +17,21 @@ type StateHook struct {
1517
sync.Mutex
1618

1719
StateMgr statemgr.Writer
20+
21+
// If PersistInterval is nonzero then for any new state update after
22+
// the duration has elapsed we'll try to persist a state snapshot
23+
// to the persistent backend too.
24+
// That's only possible if field Schemas is valid, because the
25+
// StateMgr.PersistState function for some backends needs schemas.
26+
PersistInterval time.Duration
27+
28+
// Schemas are the schemas to use when persisting state due to
29+
// PersistInterval. This is ignored if PersistInterval is zero,
30+
// and PersistInterval is ignored if this is nil.
31+
Schemas *terraform.Schemas
32+
33+
lastPersist time.Time
34+
forcePersist bool
1835
}
1936

2037
var _ terraform.Hook = (*StateHook)(nil)
@@ -23,11 +40,56 @@ func (h *StateHook) PostStateUpdate(new *states.State) (terraform.HookAction, er
2340
h.Lock()
2441
defer h.Unlock()
2542

43+
if h.lastPersist.IsZero() {
44+
// The first PostStateUpdate starts the clock for intermediate
45+
// calls to PersistState.
46+
h.lastPersist = time.Now()
47+
}
48+
2649
if h.StateMgr != nil {
2750
if err := h.StateMgr.WriteState(new); err != nil {
2851
return terraform.HookActionHalt, err
2952
}
53+
if mgrPersist, ok := h.StateMgr.(statemgr.Persister); ok && h.PersistInterval != 0 && h.Schemas != nil {
54+
if h.forcePersist || time.Since(h.lastPersist) >= h.PersistInterval {
55+
err := mgrPersist.PersistState(h.Schemas)
56+
if err != nil {
57+
return terraform.HookActionHalt, err
58+
}
59+
h.lastPersist = time.Now()
60+
}
61+
}
3062
}
3163

3264
return terraform.HookActionContinue, nil
3365
}
66+
67+
func (h *StateHook) Stopping() {
68+
h.Lock()
69+
defer h.Unlock()
70+
71+
// If Terraform has been asked to stop then that might mean that a hard
72+
// kill signal will follow shortly in case Terraform doesn't stop
73+
// quickly enough, and so we'll try to persist the latest state
74+
// snapshot in the hope that it'll give the user less recovery work to
75+
// do if they _do_ subsequently hard-kill Terraform during an apply.
76+
77+
if mgrPersist, ok := h.StateMgr.(statemgr.Persister); ok && h.Schemas != nil {
78+
err := mgrPersist.PersistState(h.Schemas)
79+
if err != nil {
80+
// This hook can't affect Terraform Core's ongoing behavior,
81+
// but it's a best effort thing anyway so we'll just emit a
82+
// log to aid with debugging.
83+
log.Printf("[ERROR] Failed to persist state after interruption: %s", err)
84+
}
85+
86+
// While we're in the stopping phase we'll try to persist every
87+
// new state update to maximize every opportunity we get to avoid
88+
// losing track of objects that have been created or updated.
89+
// Terraform Core won't start any new operations after it's been
90+
// stopped, so at most we should see one more PostStateUpdate
91+
// call per already-active request.
92+
h.forcePersist = true
93+
}
94+
95+
}

internal/backend/local/hook_state_test.go

Lines changed: 126 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,12 @@
11
package local
22

33
import (
4+
"fmt"
45
"testing"
6+
"time"
57

8+
"github.com/google/go-cmp/cmp"
9+
"github.com/hashicorp/terraform/internal/states"
610
"github.com/hashicorp/terraform/internal/states/statemgr"
711
"github.com/hashicorp/terraform/internal/terraform"
812
)
@@ -27,3 +31,125 @@ func TestStateHook(t *testing.T) {
2731
t.Fatalf("bad state: %#v", is.State())
2832
}
2933
}
34+
35+
func TestStateHookStopping(t *testing.T) {
36+
is := &testPersistentState{}
37+
hook := &StateHook{
38+
StateMgr: is,
39+
Schemas: &terraform.Schemas{},
40+
PersistInterval: 4 * time.Hour,
41+
lastPersist: time.Now(),
42+
}
43+
44+
s := statemgr.TestFullInitialState()
45+
action, err := hook.PostStateUpdate(s)
46+
if err != nil {
47+
t.Fatalf("unexpected error from PostStateUpdate: %s", err)
48+
}
49+
if got, want := action, terraform.HookActionContinue; got != want {
50+
t.Fatalf("wrong hookaction %#v; want %#v", got, want)
51+
}
52+
if is.Written == nil || !is.Written.Equal(s) {
53+
t.Fatalf("mismatching state written")
54+
}
55+
if is.Persisted != nil {
56+
t.Fatalf("persisted too soon")
57+
}
58+
59+
// We'll now force lastPersist to be long enough ago that persisting
60+
// should be due on the next call.
61+
hook.lastPersist = time.Now().Add(-5 * time.Hour)
62+
hook.PostStateUpdate(s)
63+
if is.Written == nil || !is.Written.Equal(s) {
64+
t.Fatalf("mismatching state written")
65+
}
66+
if is.Persisted == nil || !is.Persisted.Equal(s) {
67+
t.Fatalf("mismatching state persisted")
68+
}
69+
hook.PostStateUpdate(s)
70+
if is.Written == nil || !is.Written.Equal(s) {
71+
t.Fatalf("mismatching state written")
72+
}
73+
if is.Persisted == nil || !is.Persisted.Equal(s) {
74+
t.Fatalf("mismatching state persisted")
75+
}
76+
77+
gotLog := is.CallLog
78+
wantLog := []string{
79+
// Initial call before we reset lastPersist
80+
"WriteState",
81+
82+
// Write and then persist after we reset lastPersist
83+
"WriteState",
84+
"PersistState",
85+
86+
// Final call when persisting wasn't due yet.
87+
"WriteState",
88+
}
89+
if diff := cmp.Diff(wantLog, gotLog); diff != "" {
90+
t.Fatalf("wrong call log so far\n%s", diff)
91+
}
92+
93+
// We'll reset the log now before we try seeing what happens after
94+
// we use "Stopped".
95+
is.CallLog = is.CallLog[:0]
96+
is.Persisted = nil
97+
98+
hook.Stopping()
99+
if is.Persisted == nil || !is.Persisted.Equal(s) {
100+
t.Fatalf("mismatching state persisted")
101+
}
102+
103+
is.Persisted = nil
104+
hook.PostStateUpdate(s)
105+
if is.Persisted == nil || !is.Persisted.Equal(s) {
106+
t.Fatalf("mismatching state persisted")
107+
}
108+
is.Persisted = nil
109+
hook.PostStateUpdate(s)
110+
if is.Persisted == nil || !is.Persisted.Equal(s) {
111+
t.Fatalf("mismatching state persisted")
112+
}
113+
114+
gotLog = is.CallLog
115+
wantLog = []string{
116+
// "Stopping" immediately persisted
117+
"PersistState",
118+
119+
// PostStateUpdate then writes and persists on every call,
120+
// on the assumption that we're now bailing out after
121+
// being cancelled and trying to save as much state as we can.
122+
"WriteState",
123+
"PersistState",
124+
"WriteState",
125+
"PersistState",
126+
}
127+
if diff := cmp.Diff(wantLog, gotLog); diff != "" {
128+
t.Fatalf("wrong call log once in stopping mode\n%s", diff)
129+
}
130+
}
131+
132+
type testPersistentState struct {
133+
CallLog []string
134+
135+
Written *states.State
136+
Persisted *states.State
137+
}
138+
139+
var _ statemgr.Writer = (*testPersistentState)(nil)
140+
var _ statemgr.Persister = (*testPersistentState)(nil)
141+
142+
func (sm *testPersistentState) WriteState(state *states.State) error {
143+
sm.CallLog = append(sm.CallLog, "WriteState")
144+
sm.Written = state
145+
return nil
146+
}
147+
148+
func (sm *testPersistentState) PersistState(schemas *terraform.Schemas) error {
149+
if schemas == nil {
150+
return fmt.Errorf("no schemas")
151+
}
152+
sm.CallLog = append(sm.CallLog, "PersistState")
153+
sm.Persisted = sm.Written
154+
return nil
155+
}

internal/terraform/context.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -193,6 +193,12 @@ func (c *Context) Stop() {
193193
c.runContextCancel = nil
194194
}
195195

196+
// Notify all of the hooks that we're stopping, in case they want to try
197+
// to flush in-memory state to disk before a subsequent hard kill.
198+
for _, hook := range c.hooks {
199+
hook.Stopping()
200+
}
201+
196202
// Grab the condition var before we exit
197203
if cond := c.runCond; cond != nil {
198204
log.Printf("[INFO] terraform: waiting for graceful stop to complete")

0 commit comments

Comments
 (0)