From 98d9090400b996e8ec933ec1da27328d9f86c292 Mon Sep 17 00:00:00 2001 From: Dan Gottlieb Date: Wed, 8 Jan 2025 16:14:08 -0500 Subject: [PATCH 1/2] RSDK-9706: Fix FTDC tracking of module restarts/crashes. --- ftdc/cmd/parser.go | 3 ++ module/modmanager/manager.go | 12 ++++- module/modmanager/manager_test.go | 86 +++++++++++++++++++++++++++++++ 3 files changed, 99 insertions(+), 2 deletions(-) diff --git a/ftdc/cmd/parser.go b/ftdc/cmd/parser.go index 6370129436a..8e718c05e3e 100644 --- a/ftdc/cmd/parser.go +++ b/ftdc/cmd/parser.go @@ -441,6 +441,9 @@ func main() { nolintPrintln("reset range") nolintPrintln("- Unset any prior range. \"zoom out to full\"") nolintPrintln() + nolintPrintln("r, refresh") + nolintPrintln("- Regenerate the plot.png image. Useful when a current viam-server is running.") + nolintPrintln() nolintPrintln("`quit` or Ctrl-d to exit") case strings.HasPrefix(cmd, "range "): pieces := strings.SplitN(cmd, " ", 3) diff --git a/module/modmanager/manager.go b/module/modmanager/manager.go index 1f333825bdc..ad916237ccd 100644 --- a/module/modmanager/manager.go +++ b/module/modmanager/manager.go @@ -867,6 +867,10 @@ func (mgr *Manager) newOnUnexpectedExitHandler(mod *module) func(exitCode int) b "error", err) } + if mgr.ftdc != nil { + mgr.ftdc.Remove(mod.getFTDCName()) + } + // If attemptRestart returns any orphaned resource names, restart failed, // and we should remove orphaned resources. Since we handle process // restarting ourselves, return false here so goutils knows not to attempt @@ -1225,7 +1229,7 @@ func (m *module) stopProcess() error { // of metrics will be reported. Therefore it is safe to continue monitoring the module process // while it's in shutdown. if m.ftdc != nil { - m.ftdc.Remove(m.process.ID()) + m.ftdc.Remove(m.getFTDCName()) } }() @@ -1347,6 +1351,10 @@ func (m *module) getFullEnvironment(viamHomeDir string) map[string]string { return getFullEnvironment(m.cfg, m.dataDir, viamHomeDir) } +func (m *module) getFTDCName() string { + return fmt.Sprintf("proc.modules.%s", m.process.ID()) +} + func (m *module) registerProcessWithFTDC() { if m.ftdc == nil { return @@ -1364,7 +1372,7 @@ func (m *module) registerProcessWithFTDC() { return } - m.ftdc.Add(fmt.Sprintf("proc.modules.%s", m.process.ID()), statser) + m.ftdc.Add(m.getFTDCName(), statser) } func getFullEnvironment( diff --git a/module/modmanager/manager_test.go b/module/modmanager/manager_test.go index 23362cd9917..e85a6ebfb90 100644 --- a/module/modmanager/manager_test.go +++ b/module/modmanager/manager_test.go @@ -1,6 +1,7 @@ package modmanager import ( + "bytes" "context" "encoding/json" "errors" @@ -28,6 +29,7 @@ import ( "go.viam.com/rdk/components/generic" "go.viam.com/rdk/components/motor" "go.viam.com/rdk/config" + "go.viam.com/rdk/ftdc" "go.viam.com/rdk/logging" modlib "go.viam.com/rdk/module" modmanageroptions "go.viam.com/rdk/module/modmanager/options" @@ -1379,6 +1381,90 @@ func TestBadModuleFailsFast(t *testing.T) { test.That(t, err.Error(), test.ShouldContainSubstring, "module test-module exited too quickly after attempted startup") } +// TestFTDCAfterModuleCrash is to give confidence that the FTDC sections devoted to tracking module +// process information (e.g: CPU usage) is in sync with the Process IDs (PIDs) that are actually +// running. +func TestFTDCAfterModuleCrash(t *testing.T) { + logger := logging.NewTestLogger(t) + modCfgs := []config.Module{ + { + Name: "test-module", + // testmodule2 has a `kill_module` DoCommand to force a module/process crash. + ExePath: rtestutils.BuildTempModule(t, "module/testmodule2"), + Type: config.ModuleTypeLocal, + }, + } + + ctx := context.Background() + parentAddr := setupSocketWithRobot(t) + opts := modmanageroptions.Options{UntrustedEnv: false} + + // Start up a mod manager with FTDC enabled. We will inspect the FTDC output for the + // `ElapsedTimeSecs` to assert the "pid tracking code" is working correctly. + ftdcData := bytes.NewBuffer(nil) + opts.FTDC = ftdc.NewWithWriter(ftdcData, logger) + // Normally a test would explicitly call `constructDatum` to control/guarantee FTDC gets + // data. But as a short-cut to avoid exposing methods that are currently private, we just run + // FTDC in the background. And sleep long enough between testing events (killing the module) to + // assert the right behavior. + opts.FTDC.Start() + + // Set up a mod manager. Currently there are zero modules running. + mgr := setupModManager(t, ctx, parentAddr, logger, opts) + + // Add a module, this will register an FTDC "section" for that module process. + err := mgr.Add(ctx, modCfgs...) + test.That(t, err, test.ShouldBeNil) + + // Add a resource -- this is simply to invoke the `kill_module` command. + res, err := mgr.AddResource(ctx, resource.Config{ + Name: "foo", + API: generic.API, + Model: resource.NewModel("rdk", "test", "helper2"), + }, nil) + test.That(t, err, test.ShouldBeNil) + test.That(t, mgr.IsModularResource(generic.Named("foo")), test.ShouldBeTrue) + + // Kill the module a few times for good measure. + for idx := 0; idx < 3; idx++ { + _, err = res.DoCommand(ctx, map[string]interface{}{"command": "kill_module"}) + + // FTDC is running in the background with a one second interval. So we sleep for two seconds + // and cross our fingers we don't get a poor scheduler execution. The assertions are + // intentionally weak to minimize the risk of false positives (a test failure with correct + // production code). + time.Sleep(2 * time.Second) + } + + mgr.Close(ctx) + opts.FTDC.StopAndJoin(ctx) + + datums, err := ftdc.Parse(ftdcData) + logger.Info("Num ftdc datums: ", len(datums)) + + // Keep count of the number of `ElapsedTimeSecs` readings we encounter. It is a testing bug if + // we don't see any process FTDC metrics for the module. + numModuleElapsedTimeMetricsSeen := 0 + for _, datum := range datums { + for _, reading := range datum.Readings { + if reading.MetricName == "proc.modules.test-module.ElapsedTimeSecs" { + logger.Infow("Reading", "timestamp", datum.Time, "elapsedTimeSecs", reading.Value) + numModuleElapsedTimeMetricsSeen++ + // Dan: I don't have a good reason to believe that we can't (legitimately) observe + // an `ElapsedTimeSecs` of 0 here. It's more likely we'd see a 0 because we queried + // a bad PID. + // + // If my assumption is wrong and we get a false positive here, we can reevaluate the + // options for making a more robust test. + test.That(t, reading.Value, test.ShouldBeGreaterThan, 0) + } + } + } + + // Assert that we saw at least one datapoint before considering the test a success. + test.That(t, numModuleElapsedTimeMetricsSeen, test.ShouldBeGreaterThan, 0) +} + func TestModularDiscoverFunc(t *testing.T) { ctx := context.Background() logger := logging.NewTestLogger(t) From 43bd1e55a55fc90975d3c6e09c275f3eaa2aeb2d Mon Sep 17 00:00:00 2001 From: Dan Gottlieb Date: Thu, 9 Jan 2025 21:11:43 -0500 Subject: [PATCH 2/2] lint --- module/modmanager/manager_test.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/module/modmanager/manager_test.go b/module/modmanager/manager_test.go index e85a6ebfb90..67235710bf6 100644 --- a/module/modmanager/manager_test.go +++ b/module/modmanager/manager_test.go @@ -1427,7 +1427,7 @@ func TestFTDCAfterModuleCrash(t *testing.T) { // Kill the module a few times for good measure. for idx := 0; idx < 3; idx++ { - _, err = res.DoCommand(ctx, map[string]interface{}{"command": "kill_module"}) + _, _ = res.DoCommand(ctx, map[string]interface{}{"command": "kill_module"}) // FTDC is running in the background with a one second interval. So we sleep for two seconds // and cross our fingers we don't get a poor scheduler execution. The assertions are @@ -1440,6 +1440,7 @@ func TestFTDCAfterModuleCrash(t *testing.T) { opts.FTDC.StopAndJoin(ctx) datums, err := ftdc.Parse(ftdcData) + test.That(t, err, test.ShouldBeNil) logger.Info("Num ftdc datums: ", len(datums)) // Keep count of the number of `ElapsedTimeSecs` readings we encounter. It is a testing bug if