Skip to content

Commit aa7ee88

Browse files
committed
Bugfixes and refactoring in osquery runtime
- Correctly detect when error channel is closed (potential fix for #134). Previously the logic was inverted for whether the channel was closed, so recovery was not initiated. Unit test TestOsqueryDies repros the suspected issue. - Allow logger to be set properly. - Add logging around recovery scenarios. - Check communication with both osquery and extension server in health check (previously only the extension server was checked). - Add healthcheck on interval that causes recovery on failure (Closes #141). - Do not set cmd output to ioutil.Discard. Causes a bug with cmd.Wait (see golang/go#20730)
1 parent d06b7dd commit aa7ee88

File tree

3 files changed

+94
-26
lines changed

3 files changed

+94
-26
lines changed

cmd/launcher/launcher.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -148,6 +148,7 @@ func main() {
148148

149149
// Start the osqueryd instance
150150
instance, err := osquery.LaunchOsqueryInstance(
151+
osquery.WithLogger(logger),
151152
osquery.WithOsquerydBinary(opts.osquerydPath),
152153
osquery.WithRootDirectory(rootDirectory),
153154
osquery.WithConfigPluginFlag("kolide_grpc"),

osquery/runtime.go

Lines changed: 67 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,6 @@ import (
2929
type OsqueryInstance struct {
3030
*osqueryInstanceFields
3131
instanceLock sync.Mutex
32-
logger log.Logger
3332
}
3433

3534
// osqueryInstanceFields is a type which is embedded in OsqueryInstance so that
@@ -48,6 +47,7 @@ type osqueryInstanceFields struct {
4847
stdout io.Writer
4948
stderr io.Writer
5049
retries uint
50+
logger log.Logger
5151

5252
// the following are instance artifacts that are created and held as a result
5353
// of launching an osqueryd process
@@ -129,8 +129,12 @@ func createOsquerydCommand(osquerydBinary string, paths *osqueryFilePaths, confi
129129
"--force=true",
130130
"--disable_watchdog",
131131
)
132-
cmd.Stdout = stdout
133-
cmd.Stderr = stderr
132+
if stdout != nil {
133+
cmd.Stdout = stdout
134+
}
135+
if stderr != nil {
136+
cmd.Stderr = stderr
137+
}
134138

135139
return cmd, nil
136140
}
@@ -152,6 +156,14 @@ func osqueryTempDir() (string, func(), error) {
152156
// https://dave.cheney.net/2014/10/17/functional-options-for-friendly-apis
153157
type OsqueryInstanceOption func(*OsqueryInstance)
154158

159+
// WithLogger is a functional option which allows the user to pass a log.Logger
160+
// to be used for logging osquery instance status.
161+
func WithLogger(logger log.Logger) OsqueryInstanceOption {
162+
return func(i *OsqueryInstance) {
163+
i.logger = logger
164+
}
165+
}
166+
155167
// WithOsqueryExtensionPlugin is a functional option which allows the user to
156168
// declare a number of osquery plugins (ie: config plugin, logger plugin, tables,
157169
// etc) which can be loaded when calling LaunchOsqueryInstance. You can load as
@@ -267,11 +279,10 @@ func LaunchOsqueryInstance(opts ...OsqueryInstanceOption) (*OsqueryInstance, err
267279
// caller.
268280
o := &OsqueryInstance{
269281
osqueryInstanceFields: &osqueryInstanceFields{
270-
stdout: ioutil.Discard,
271-
stderr: ioutil.Discard,
272282
rmRootDirectory: func() {},
273283
errs: make(chan error),
274284
clientLock: new(sync.Mutex),
285+
logger: log.NewNopLogger(),
275286
},
276287
}
277288

@@ -436,21 +447,47 @@ func launchOsqueryInstance(o *OsqueryInstance) (*OsqueryInstance, error) {
436447
// Launch a long-running recovery goroutine which can handle various errors
437448
// that can occur
438449
go func() {
439-
// Block until an error is generated by the osqueryd process itself or the
440-
// extension manager server. We don't select, because if one element of the
441-
// runtime produces an error, it's likely that all of the other components
442-
// will produce errors as well since everything is so interconnected. For
443-
// this reason, when any error occurs, we attempt a total recovery.
444-
runtimeError, done := <-errChannel
445-
if done {
446-
return
447-
}
448-
if recoveryError := o.Recover(runtimeError); recoveryError != nil {
449-
// If we were not able to recover the osqueryd process for some reason,
450-
// kill the process and hope that the operating system scheduling
451-
// mechanism (launchd, etc) can relaunch the tool cleanly.
452-
level.Info(o.logger).Log("err", errors.Wrap(recoveryError, "could not recover the osqueryd process"))
453-
os.Exit(1)
450+
ticker := time.NewTicker(60 * time.Second)
451+
defer ticker.Stop()
452+
453+
for {
454+
needsRecovery := false
455+
select {
456+
case <-ticker.C:
457+
healthy, err := o.Healthy()
458+
if err != nil {
459+
needsRecovery = true
460+
level.Error(o.logger).Log("err", errors.Wrap(err, "checking instance health"))
461+
}
462+
if !healthy {
463+
needsRecovery = true
464+
level.Error(o.logger).Log("msg", "instance not healthy")
465+
}
466+
467+
// Block until an error is generated by the osqueryd process itself or the
468+
// extension manager server. We don't select, because if one element of the
469+
// runtime produces an error, it's likely that all of the other components
470+
// will produce errors as well since everything is so interconnected. For
471+
// this reason, when any error occurs, we attempt a total recovery.
472+
case runtimeError, open := <-errChannel:
473+
if !open {
474+
return
475+
}
476+
needsRecovery = true
477+
level.Error(o.logger).Log("err", errors.Wrap(runtimeError, "osquery runtime error"))
478+
}
479+
480+
if needsRecovery {
481+
level.Info(o.logger).Log("msg", "recovering osquery instance")
482+
if recoveryError := o.Recover(); recoveryError != nil {
483+
// If we were not able to recover the osqueryd process for some reason,
484+
// kill the process and hope that the operating system scheduling
485+
// mechanism (launchd, etc) can relaunch the tool cleanly.
486+
level.Error(o.logger).Log("err", errors.Wrap(recoveryError, "could not recover the osqueryd process"))
487+
os.Exit(1)
488+
}
489+
return
490+
}
454491
}
455492
}()
456493

@@ -471,7 +508,7 @@ func (o *OsqueryInstance) beginTeardown() bool {
471508
// release resources because Kill() expects the osquery instance to be healthy,
472509
// whereas Recover() expects a hostile environment and is slightly more
473510
// defensive in it's actions.
474-
func (o *OsqueryInstance) Recover(runtimeError error) error {
511+
func (o *OsqueryInstance) Recover() error {
475512
// If the user explicitly calls o.Kill(), as the components are shutdown, they
476513
// may exit with errors. In this case, we shouldn't recover the
477514
// instance.
@@ -545,11 +582,16 @@ func (o *OsqueryInstance) Restart() error {
545582
// being managed by the current instantiation of this OsqueryInstance is
546583
// healthy.
547584
func (o *OsqueryInstance) Healthy() (bool, error) {
548-
status, err := o.extensionManagerServer.Ping()
585+
serverStatus, err := o.extensionManagerServer.Ping()
586+
if err != nil {
587+
return false, errors.Wrap(err, "could not ping extension server")
588+
}
589+
590+
clientStatus, err := o.extensionManagerClient.Ping()
549591
if err != nil {
550-
return false, errors.Wrap(err, "could not ping osquery through extension interface")
592+
return false, errors.Wrap(err, "could not ping osquery extension client")
551593
}
552-
return status.Code == 0, nil
594+
return serverStatus.Code == 0 && clientStatus.Code == 0, nil
553595
}
554596

555597
func (o *OsqueryInstance) Query(query string) ([]map[string]string, error) {
@@ -579,6 +621,7 @@ func (o *OsqueryInstance) relaunchAndReplace() error {
579621
WithLoggerPluginFlag(o.loggerPluginFlag),
580622
WithDistributedPluginFlag(o.distributedPluginFlag),
581623
WithRetries(o.retries),
624+
WithLogger(o.logger),
582625
}
583626
if !o.usingTempDir {
584627
opts = append(opts, WithRootDirectory(o.rootDirectory))

osquery/runtime_test.go

Lines changed: 26 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -149,8 +149,32 @@ func TestRecover(t *testing.T) {
149149
)
150150
require.NoError(t, err)
151151

152-
require.NoError(t, instance.Recover(errors.New("fabricated in a test")))
153-
require.NoError(t, instance.Recover(errors.New("fabricated in a test")))
152+
require.NoError(t, instance.Recover())
153+
require.NoError(t, instance.Recover())
154+
time.Sleep(1 * time.Second)
155+
156+
healthy, err := instance.Healthy()
157+
require.NoError(t, err)
158+
require.True(t, healthy)
159+
160+
require.NoError(t, instance.Kill())
161+
}
162+
163+
func TestOsqueryDies(t *testing.T) {
164+
t.Parallel()
165+
rootDirectory, rmRootDirectory, err := osqueryTempDir()
166+
require.NoError(t, err)
167+
defer rmRootDirectory()
168+
169+
require.NoError(t, buildOsqueryExtensionInBinDir(getBinDir(t)))
170+
instance, err := LaunchOsqueryInstance(
171+
WithRootDirectory(rootDirectory),
172+
WithRetries(3),
173+
)
174+
require.NoError(t, err)
175+
176+
require.NoError(t, instance.cmd.Process.Kill())
177+
time.Sleep(3 * time.Second)
154178

155179
healthy, err := instance.Healthy()
156180
require.NoError(t, err)

0 commit comments

Comments
 (0)