Use TSDB's WAL for writes. #1103

Merged 79 commits on Jan 21, 2020
Commits
7228c1c
Update vendored libraries:
tomwilkie Oct 30, 2018
831080f
Use TSDB's WAL for writes.
tomwilkie Oct 30, 2018
8710a22
Merge remote-tracking branch 'upstream/master' into wal
codesome Jul 10, 2019
0802aa6
Merge pull request #23 from codesome/wal
codesome Jul 10, 2019
2a6fc2d
Fix merge conflicts
codesome Jul 10, 2019
bc25534
Merge remote-tracking branch 'grafana/wal' into wal
codesome Jul 10, 2019
f023ddb
Fix creation of user states in WAL recover
codesome Jul 11, 2019
5c022da
Merge remote-tracking branch 'upstream/master' into wal
codesome Jul 12, 2019
ac98b13
Remove WAL recovery on start-up and allow nil Record
codesome Aug 6, 2019
b84b02e
Fix types
codesome Aug 6, 2019
587b8ad
Merge remote-tracking branch 'upstream/master' into wal
codesome Aug 12, 2019
05127b0
WAL compression always enabled
codesome Aug 12, 2019
787fbb8
Change checkpoint logic to be more like prometheus's tsdb
codesome Aug 13, 2019
9845a5e
Add metrics for checkpoint and name changes
codesome Aug 13, 2019
5bb950f
Merge remote-tracking branch 'upstream/master' into wal
codesome Aug 13, 2019
f21f814
Initial attempt for flushing chunks from WAL [WIP]
codesome Aug 14, 2019
46eaf36
Combine checkpoint and WAL chunks before flushing
codesome Aug 14, 2019
2b17103
Bring back recovery and tests
codesome Aug 14, 2019
935b73e
Fix race in the test
codesome Aug 14, 2019
a7844c7
Merge remote-tracking branch 'upstream/master' into wal
codesome Aug 16, 2019
538f407
Recover on startup
codesome Aug 19, 2019
ff35948
Dont remove the last segment in truncation
codesome Aug 30, 2019
2913b99
Always read WAL and remove recover-only mode
codesome Sep 3, 2019
e691e01
Timer for WAL recovery time
codesome Sep 6, 2019
dbd336e
More rigorous test for WAL
codesome Sep 11, 2019
0e1577a
More profiling in debug mode
codesome Sep 11, 2019
be2ccbf
Merge remote-tracking branch 'upstream/master' into wal
codesome Sep 11, 2019
eeaae15
Fix race in test
codesome Sep 11, 2019
214b32e
No limits on number of series during recovery
codesome Sep 12, 2019
43dbc9e
No rate limiting of checkpoint
codesome Sep 13, 2019
7a9fec0
Change segment deletion logic
codesome Sep 14, 2019
2d85a98
Process WAL records in parallel.
codesome Sep 17, 2019
50d22c8
Added comments and some refactoring and not returning on no-series fo…
codesome Sep 17, 2019
7282ee8
Process checkpoint series in parallel
codesome Sep 18, 2019
6cd9940
Merge remote-tracking branch 'upstream/master' into wal
codesome Sep 18, 2019
b8a19d2
Merge remote-tracking branch 'upstream/master' into wal
codesome Sep 19, 2019
370dcea
Merge remote-tracking branch 'upstream/master' into wal
codesome Sep 20, 2019
959cf20
Fix race in processing WAL
codesome Sep 24, 2019
8eb90c9
Merge remote-tracking branch 'upstream/master' into wal
codesome Sep 24, 2019
d954865
Small enhancements
codesome Sep 27, 2019
fb75c9c
Cache the user states and series when processing samples
codesome Sep 30, 2019
51beb93
Enhancement in the user state cache and fix in samples buffer
codesome Sep 30, 2019
47a8434
Cache user states and series right from the checkpoint
codesome Oct 1, 2019
248566a
Small enhancements
codesome Oct 2, 2019
f147570
Fix the flag
codesome Oct 3, 2019
bc8a194
Remove test files
codesome Oct 10, 2019
f6ddbdf
Use tsdb from prometheus/prometheus repo
codesome Oct 10, 2019
ad3476e
Avoid flushing on shutdown
codesome Oct 25, 2019
737c415
Merge remote-tracking branch 'upstream/master' into wal
codesome Nov 10, 2019
251bde5
Fix after rebase
codesome Nov 11, 2019
88009b9
Fix bug of resetting userStates
codesome Nov 13, 2019
5dfe853
Fix review comments
codesome Nov 14, 2019
d661843
Merge remote-tracking branch 'upstream/master' into wal
codesome Nov 14, 2019
c55cedc
Small enhancements
codesome Nov 18, 2019
02c3fd6
Merge remote-tracking branch 'upstream/master' into wal
codesome Dec 5, 2019
902917f
Update comments
codesome Dec 5, 2019
4b0d578
Remove ingester<->WAL circular dependancy
codesome Dec 5, 2019
d2fb739
Merge remote-tracking branch 'upstream/master' into wal
codesome Dec 6, 2019
d799951
Change segment size of the WAL
codesome Dec 6, 2019
69e0a48
Use same directory for temporary tokens file
codesome Dec 6, 2019
9758838
Merge branch 'fix-tokens-on-file' into wal
codesome Dec 6, 2019
14218bf
Disble transfer out when WAL is enabled
codesome Dec 11, 2019
3f31526
Flush on shutdown endpoint irrespective of WAL
codesome Dec 11, 2019
db679b1
Merge remote-tracking branch 'upstream/master' into wal
codesome Dec 16, 2019
7e6f5de
Use sync.Pool for the records
codesome Dec 16, 2019
60637c1
Merge remote-tracking branch 'upstream/master' into wal
codesome Dec 17, 2019
f2d0b29
Fix Goutham's comments
codesome Dec 18, 2019
4dd4ce6
Merge remote-tracking branch 'upstream/master' into wal
codesome Dec 19, 2019
ce6a244
Fix Goutham's comments
codesome Dec 19, 2019
3fc8e31
Fix possible data corruption, goroutine deadlock and memory leak
codesome Dec 20, 2019
e618463
Fix review comments
codesome Jan 3, 2020
037d4b3
memoryChunks counter fix, metics updated, small cleanup
codesome Jan 8, 2020
27c6f85
Merge remote-tracking branch 'upstream/master' into wal
codesome Jan 20, 2020
3fb38ca
Update config file and argument doc
codesome Jan 20, 2020
92c1149
Add guide to run/migrate-to WAL in ingesters
codesome Jan 20, 2020
30c71a1
Merge remote-tracking branch 'upstream/master' into wal
codesome Jan 20, 2020
8b55cdb
Fix review comments
codesome Jan 20, 2020
aaa2aa2
Merge remote-tracking branch 'upstream/master' into wal
codesome Jan 20, 2020
fe3cc08
Fix review comments
codesome Jan 21, 2020
2 changes: 2 additions & 0 deletions Makefile
@@ -49,12 +49,14 @@ $(foreach exe, $(EXES), $(eval $(call dep_exe, $(exe))))

# Manually declared dependencies And what goes into each exe
pkg/ingester/client/cortex.pb.go: pkg/ingester/client/cortex.proto
pkg/ingester/wal.pb.go: pkg/ingester/wal.proto
pkg/ring/ring.pb.go: pkg/ring/ring.proto
pkg/querier/frontend/frontend.pb.go: pkg/querier/frontend/frontend.proto
pkg/querier/queryrange/queryrange.pb.go: pkg/querier/queryrange/queryrange.proto
pkg/chunk/storage/caching_index_client.pb.go: pkg/chunk/storage/caching_index_client.proto
pkg/distributor/ha_tracker.pb.go: pkg/distributor/ha_tracker.proto
pkg/ruler/rules/rules.pb.go: pkg/ruler/rules/rules.proto

all: $(UPTODATE_FILES)
test: protos
mod-check: protos
18 changes: 18 additions & 0 deletions docs/configuration/arguments.md
@@ -305,6 +305,24 @@ It also talks to a KVStore and has it's own copies of the same flags used by the
Where you don't want to cache every chunk written by ingesters, but you do want to take advantage of chunk write deduplication, this option will make ingesters write a placeholder to the cache for each chunk.
Make sure you configure ingesters with a different cache to queriers, which need the whole value.

#### WAL

- `--ingester.wal-dir`
  Directory where the WAL data should be stored and/or recovered from.

- `--ingester.wal-enabled`

Setting this to `true` enables writing to WAL during ingestion.

- `--ingester.checkpoint-enabled`
  Set this to `true` to enable checkpointing of in-memory chunks to disk. This is optional, but it helps speed up the replay process.

- `--ingester.checkpoint-duration`
This is the interval at which checkpoints should be created.

- `--ingester.recover-from-wal`
  Set this to `true` to recover data from an existing WAL. The data is recovered even if writing to the WAL is disabled, as long as this flag is set to `true`. The WAL directory needs to be set for this. A minimal example of the equivalent YAML configuration follows.
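
For reference, the flags above map to the `walconfig` block of the ingester section in the YAML config file (see the config file reference). The sketch below is illustrative only; the `/data/wal` path and the `15m` interval are assumptions, not recommendations.

```yaml
walconfig:
  # Write ingested data to the WAL and checkpoint in-memory chunks.
  wal_enabled: true
  checkpoint_enabled: true
  checkpoint_duration: 15m
  # Replay an existing WAL on startup.
  recover_from_wal: true
  # Must be on a persistent (mounted) volume.
  wal_dir: /data/wal
```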

## Runtime Configuration file

Cortex has a concept of "runtime config" file, which is simply a file that is reloaded while Cortex is running. It is used by some Cortex components to allow operator to change some aspects of Cortex configuration without restarting it. File is specified by using `-runtime-config.file=<filename>` flag and reload period (which defaults to 10 seconds) can be changed by `-runtime-config.reload-period=<duration>` flag. Previously this mechanism was only used by limits overrides, and flags were called `-limits.per-user-override-config=<filename>` and `-limits.per-user-override-period=10s` respectively. These are still used, if `-runtime-config.file=<filename>` is not specified.
21 changes: 21 additions & 0 deletions docs/configuration/config-file-reference.md
@@ -323,6 +323,27 @@ ring:
The `ingester_config` configures the Cortex ingester.

```yaml
walconfig:
# Enable writing of ingested data into WAL.
# CLI flag: -ingester.wal-enabled
[wal_enabled: <boolean> | default = false]

# Enable checkpointing of in-memory chunks.
# CLI flag: -ingester.checkpoint-enabled
[checkpoint_enabled: <boolean> | default = false]

# Recover data from existing WAL irrespective of WAL enabled/disabled.
# CLI flag: -ingester.recover-from-wal
[recover_from_wal: <boolean> | default = false]

# Directory to store the WAL and/or recover from WAL.
# CLI flag: -ingester.wal-dir
[wal_dir: <string> | default = "wal"]

# Interval at which checkpoints should be created.
# CLI flag: -ingester.checkpoint-duration
[checkpoint_duration: <duration> | default = 30m0s]

lifecycler:
ring:
kvstore:
76 changes: 76 additions & 0 deletions docs/guides/ingesters-with-wal.md
@@ -0,0 +1,76 @@
---
title: "Ingesters with WAL"
linkTitle: "Ingesters with WAL"
weight: 5
slug: ingesters-with-wal
---

Currently, ingesters running in the chunks storage mode store all their data in memory. If there is a crash, that data can be lost. The WAL (write-ahead log) helps fill this gap in reliability.

To use the WAL, some changes need to be made to the deployment.

## Changes to deployment

1. Since ingesters need to keep the same persistent volume across restarts/rollouts, all the ingesters should be run as a [StatefulSet](https://kubernetes.io/docs/concepts/workloads/controllers/statefulset/) with fixed volumes.

2. The following flags need to be set (a minimal StatefulSet sketch wiring them up follows this list):
* `--ingester.wal-dir` to the directory where the WAL data should be stored and/or recovered from. Note that this should be on the mounted volume.
* `--ingester.wal-enabled` to `true`, which enables writing to the WAL during ingestion.
* `--ingester.checkpoint-enabled` to `true` to enable checkpointing of in-memory chunks to disk. This is optional, but it helps speed up the replay process.
* `--ingester.checkpoint-duration` to the interval at which checkpoints should be created. The default is `30m`; depending on the number of series, it can be brought down to `15m` if there are fewer series per ingester (say 1M).
* `--ingester.recover-from-wal` to `true` to recover data from an existing WAL. The data is recovered even if writing to the WAL is disabled, as long as this flag is set to `true`. The WAL directory needs to be set for this.
* If you are going to enable the WAL, it is advisable to always set this to `true`.
* `--ingester.tokens-file-path` should be set to the file path where the tokens should be stored. Note that this should be on the mounted volume. Why this is required is described below.
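
Below is a minimal StatefulSet sketch wiring these flags to a persistent volume. It is illustrative only, not a complete deployment: the image and tag, the `/data` mount path, the `50Gi` volume size, and the `--target=ingester` single-binary target flag are assumptions and need to be adapted to your setup (which will typically also carry the rest of your Cortex configuration).

```yaml
apiVersion: apps/v1
kind: StatefulSet
metadata:
  name: ingester
spec:
  serviceName: ingester
  replicas: 3
  selector:
    matchLabels:
      name: ingester
  template:
    metadata:
      labels:
        name: ingester
    spec:
      containers:
        - name: ingester
          image: quay.io/cortexproject/cortex:v0.6.0   # assumed image and tag
          args:
            - --target=ingester                        # assumed single-binary target flag
            - --ingester.wal-enabled=true
            - --ingester.checkpoint-enabled=true
            - --ingester.checkpoint-duration=15m
            - --ingester.recover-from-wal=true
            - --ingester.wal-dir=/data/wal             # on the mounted volume
            - --ingester.tokens-file-path=/data/tokens # on the mounted volume
          volumeMounts:
            - name: ingester-data
              mountPath: /data
  volumeClaimTemplates:
    - metadata:
        name: ingester-data
      spec:
        accessModes: ["ReadWriteOnce"]
        resources:
          requests:
            storage: 50Gi                              # assumed size
```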

## Changes in lifecycle when WAL is enabled

1. Flushing of data to the chunk store during rollouts or scale-downs is disabled. This is because, during a rollout of a StatefulSet, no ingesters are simultaneously leaving and joining; rather, the same ingester is shut down and brought back again with an updated config. Hence flushing is skipped and the data is recovered from the WAL.

2. As there are no transfers between ingesters, the tokens are stored on and recovered from disk between rollouts/restarts. This is [not a new thing](https://github.com/cortexproject/cortex/pull/1750), but it is effective when using StatefulSets.

## Migrating from stateless deployments

The ingester _Deployment without WAL_ and the _StatefulSet with WAL_ should be scaled down and up respectively, in sync and without transfer of data between them, to ensure that any ingestion after the migration is immediately reliable.

Let's take an example of 4 ingesters. The migration would look something like this (an illustrative snapshot of the replica counts follows the list):

1. Bring up one stateful ingester `ingester-0` and wait till it's ready (accepting read and write requests).
2. Scale down the old ingester Deployment to 3 and wait till the leaving ingester flushes all of its data to the chunk store.
3. Once that ingester has disappeared from `kc get pods ...`, add another stateful ingester and wait till it's ready. This ensures no transfer takes place. Now you have `ingester-0 ingester-1`.
4. Repeat step 2 to remove another ingester from the old Deployment.
5. Repeat step 3 to add another stateful ingester. Now you have `ingester-0 ingester-1 ingester-2`.
6. Repeat steps 4 and 5 until the old Deployment is scaled down to 0, at which point you will finally have `ingester-0 ingester-1 ingester-2 ingester-3`.
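
As a concrete illustration of the intermediate state, after step 3 the two workloads would look roughly like this. The workload names `ingester` and `ingester-statefulset` are assumptions, and only the relevant fields are shown:

```yaml
# Old, stateless ingesters: scaled down from 4 to 3 in step 2.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: ingester
spec:
  replicas: 3
---
# New, WAL-enabled ingesters: ingester-0 and ingester-1 are ready.
apiVersion: apps/v1
kind: StatefulSet
metadata:
  name: ingester-statefulset
spec:
  replicas: 2
```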

## How to scale up/down

### Scale up

Scaling up is the same as what you would do without the WAL or StatefulSets. Nothing changes here.

### Scale down

Since Kubernetes doesn't differentiate between a rollout and a scale-down when sending a signal, the flushing of chunks on shutdown is disabled by default. Hence the only thing to take care of during a scale-down is flushing the chunks.

There are 2 ways to do it, with the latter being a fallback option.

**First option**

Say you have 4 ingesters `ingester-0 ingester-1 ingester-2 ingester-3` and you want to scale down to 2 ingesters. According to StatefulSet rules, the ingesters that will be shut down are `ingester-3` and then `ingester-2`.

Hence, before actually scaling down in Kubernetes, port-forward those ingesters and hit their [`/shutdown`](https://github.com/cortexproject/cortex/pull/1746) endpoint. This will flush the chunks and shut down the ingester (while also removing it from the ring).

After hitting the endpoint for `ingester-2 ingester-3`, scale down the ingesters to 2.

PS: Given that you have to scale down 1 ingester at a time, you can pipeline the shutdown and scale-down process instead of hitting the shutdown endpoint for all to-be-scaled-down ingesters at the same time.

**Fallback option**

There is a [flush mode ingester](https://github.com/cortexproject/cortex/pull/1747) in progress, and following recent discussions there will be a separate target called `flusher` in its place.

You can run it as a Kubernetes Job which will:
* Attach to the volume of the scaled down ingester
* Recover from the WAL
* And flush all the chunks.

This Job should be run for every ingester for which you missed hitting the shutdown endpoint in the first option.

More info about the flusher target will be added once it's upstream.
2 changes: 1 addition & 1 deletion pkg/distributor/distributor.go
@@ -188,7 +188,7 @@ func New(cfg Config, clientConfig ingester_client.Config, limits *validation.Ove
if !canJoinDistributorsRing {
ingestionRateStrategy = newInfiniteIngestionRateStrategy()
} else if limits.IngestionRateStrategy() == validation.GlobalIngestionRateStrategy {
distributorsRing, err = ring.NewLifecycler(cfg.DistributorRing.ToLifecyclerConfig(), nil, "distributor", ring.DistributorRingKey)
distributorsRing, err = ring.NewLifecycler(cfg.DistributorRing.ToLifecyclerConfig(), nil, "distributor", ring.DistributorRingKey, true)
if err != nil {
return nil, err
}
99 changes: 91 additions & 8 deletions pkg/ingester/ingester.go
@@ -35,8 +35,14 @@ const (
queryStreamBatchSize = 128
)

var (
// This is initialised if the WAL is enabled and the records are fetched from this pool.
recordPool sync.Pool
)

// Config for an Ingester.
type Config struct {
WALConfig WALConfig
LifecyclerConfig ring.LifecyclerConfig `yaml:"lifecycler,omitempty"`

// Config for transferring chunks. Zero or negative = no retries.
@@ -70,6 +76,7 @@ type Config struct {
// RegisterFlags adds the flags required to config this to the given FlagSet
func (cfg *Config) RegisterFlags(f *flag.FlagSet) {
cfg.LifecyclerConfig.RegisterFlags(f)
cfg.WALConfig.RegisterFlags(f)

f.IntVar(&cfg.MaxTransferRetries, "ingester.max-transfer-retries", 10, "Number of times to try and transfer chunks before falling back to flushing. Negative value or zero disables hand-over.")
f.DurationVar(&cfg.FlushCheckPeriod, "ingester.flush-period", 1*time.Minute, "Period with which to attempt to flush chunks.")
@@ -109,6 +116,9 @@ type Ingester struct {
flushQueues []*util.PriorityQueue
flushQueuesDone sync.WaitGroup

// This should never be nil.
wal WAL

// Hook for injecting behaviour from tests.
preFlushUserSeries func()

@@ -131,6 +141,19 @@ func New(cfg Config, clientConfig client.Config, limits *validation.Overrides, c
return NewV2(cfg, clientConfig, limits, registerer)
}

if cfg.WALConfig.walEnabled {
// If WAL is enabled, we don't transfer out the data to any ingester.
// Either the next ingester which takes its place should recover from the WAL
// or the data has to be flushed during scaledown.
cfg.MaxTransferRetries = 0

recordPool = sync.Pool{
New: func() interface{} {
return &Record{}
},
}
}

i := &Ingester{
cfg: cfg,
clientConfig: clientConfig,
@@ -142,14 +165,36 @@ func New(cfg Config, clientConfig client.Config, limits *validation.Overrides, c
}

var err error
i.lifecycler, err = ring.NewLifecycler(cfg.LifecyclerConfig, i, "ingester", ring.IngesterRingKey)
// During WAL recovery, it will create new user states which requires the limiter.
// Hence initialise the limiter before creating the WAL.
// The '!cfg.WALConfig.walEnabled' argument says don't flush on shutdown if the WAL is enabled.
i.lifecycler, err = ring.NewLifecycler(cfg.LifecyclerConfig, i, "ingester", ring.IngesterRingKey, !cfg.WALConfig.walEnabled)
if err != nil {
return nil, err
}

// Init the limter and instantiate the user states which depend on it
i.limiter = NewSeriesLimiter(limits, i.lifecycler, cfg.LifecyclerConfig.RingConfig.ReplicationFactor, cfg.ShardByAllLabels)
i.userStates = newUserStates(i.limiter, cfg, i.metrics)

if cfg.WALConfig.recover {
level.Info(util.Logger).Log("msg", "recovering from WAL")
start := time.Now()
if err := recoverFromWAL(i); err != nil {
level.Error(util.Logger).Log("msg", "failed to recover from WAL", "time", time.Since(start).String())
return nil, err
}
elapsed := time.Since(start)
level.Info(util.Logger).Log("msg", "recovery from WAL completed", "time", elapsed.String())
Review comment (Contributor): Can we also make this a metric? So that we can compare the duration changes over releases and also correlate it with the number of series, etc.

i.metrics.walReplayDuration.Set(elapsed.Seconds())
}

// If the WAL recovery happened, then the userStates would already be set.
if i.userStates == nil {
i.userStates = newUserStates(i.limiter, cfg, i.metrics)
}

i.wal, err = newWAL(cfg.WALConfig, i.userStates.cp)
if err != nil {
return nil, err
}

// Now that user states have been created, we can start the lifecycler
i.lifecycler.Start()
@@ -200,6 +245,8 @@ func (i *Ingester) Shutdown() {
close(i.quit)
i.done.Wait()

i.wal.Stop()

// Next initiate our graceful exit from the ring.
i.lifecycler.Shutdown()
}
@@ -209,7 +256,11 @@ func (i *Ingester) Shutdown() {
// * Change the state of ring to stop accepting writes.
// * Flush all the chunks.
func (i *Ingester) ShutdownHandler(w http.ResponseWriter, r *http.Request) {
originalState := i.lifecycler.FlushOnShutdown()
// We want to flush the chunks if transfer fails irrespective of original flag.
i.lifecycler.SetFlushOnShutdown(true)
i.Shutdown()
i.lifecycler.SetFlushOnShutdown(originalState)
w.WriteHeader(http.StatusNoContent)
}

@@ -232,11 +283,25 @@ func (i *Ingester) Push(ctx old_ctx.Context, req *client.WriteRequest) (*client.
if err != nil {
return nil, fmt.Errorf("no user id")
}

var lastPartialErr *validationError
var record *Record
if i.cfg.WALConfig.walEnabled {
record = recordPool.Get().(*Record)
record.UserId = userID
// Assuming there is not much churn in most cases, there is no use
// keeping the record.Labels slice hanging around.
record.Labels = nil
if cap(record.Samples) < len(req.Timeseries) {
record.Samples = make([]Sample, 0, len(req.Timeseries))
} else {
record.Samples = record.Samples[:0]
}
}

for _, ts := range req.Timeseries {
for _, s := range ts.Samples {
err := i.append(ctx, userID, ts.Labels, model.Time(s.TimestampMs), model.SampleValue(s.Value), req.Source)
err := i.append(ctx, userID, ts.Labels, model.Time(s.TimestampMs), model.SampleValue(s.Value), req.Source, record)
if err == nil {
continue
}
@@ -254,10 +319,19 @@ func (i *Ingester) Push(ctx old_ctx.Context, req *client.WriteRequest) (*client.
if lastPartialErr != nil {
return &client.WriteResponse{}, lastPartialErr.WrapWithUser(userID).WrappedError()
}

if record != nil {
// Log the record only if there was no error in ingestion.
if err := i.wal.Log(record); err != nil {
return nil, err
}
recordPool.Put(record)
}

return &client.WriteResponse{}, nil
}

func (i *Ingester) append(ctx context.Context, userID string, labels labelPairs, timestamp model.Time, value model.SampleValue, source client.WriteRequest_SourceEnum) error {
func (i *Ingester) append(ctx context.Context, userID string, labels labelPairs, timestamp model.Time, value model.SampleValue, source client.WriteRequest_SourceEnum, record *Record) error {
labels.removeBlanks()

var (
@@ -274,7 +348,8 @@ func (i *Ingester) append(ctx context.Context, userID string, labels labelPairs,
if i.stopped {
return fmt.Errorf("ingester stopping")
}
state, fp, series, err := i.userStates.getOrCreateSeries(ctx, userID, labels)

state, fp, series, err := i.userStates.getOrCreateSeries(ctx, userID, labels, record)
if err != nil {
if ve, ok := err.(*validationError); ok {
state.discardedSamples.WithLabelValues(ve.errorType).Inc()
@@ -310,6 +385,14 @@ func (i *Ingester) append(ctx context.Context, userID string, labels labelPairs,
return err
}

if record != nil {
record.Samples = append(record.Samples, Sample{
Fingerprint: uint64(fp),
Timestamp: uint64(timestamp),
Value: float64(value),
})
}

memoryChunks.Add(float64(len(series.chunkDescs) - prevNumChunks))
i.metrics.ingestedSamples.Inc()
switch source {
@@ -430,7 +513,7 @@ func (i *Ingester) QueryStream(req *client.QueryRequest, stream client.Ingester_
}

numSeries++
wireChunks, err := toWireChunks(chunks)
wireChunks, err := toWireChunks(chunks, nil)
if err != nil {
return err
}