Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

RSDK-9706: Fix FTDC tracking of module restarts/crashes. #4695

Merged
merged 2 commits into from
Jan 10, 2025
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions ftdc/cmd/parser.go
Original file line number Diff line number Diff line change
Expand Up @@ -441,6 +441,9 @@ func main() {
nolintPrintln("reset range")
nolintPrintln("- Unset any prior range. \"zoom out to full\"")
nolintPrintln()
nolintPrintln("r, refresh")
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Drive-by to document the "help" output for something I added in a prior PR.

nolintPrintln("- Regenerate the plot.png image. Useful when a current viam-server is running.")
nolintPrintln()
nolintPrintln("`quit` or Ctrl-d to exit")
case strings.HasPrefix(cmd, "range "):
pieces := strings.SplitN(cmd, " ", 3)
Expand Down
12 changes: 10 additions & 2 deletions module/modmanager/manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -867,6 +867,10 @@ func (mgr *Manager) newOnUnexpectedExitHandler(mod *module) func(exitCode int) b
"error", err)
}

if mgr.ftdc != nil {
mgr.ftdc.Remove(mod.getFTDCName())
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The bug fix for modules crashing -- this was just missing altogether before.

}

// If attemptRestart returns any orphaned resource names, restart failed,
// and we should remove orphaned resources. Since we handle process
// restarting ourselves, return false here so goutils knows not to attempt
Expand Down Expand Up @@ -1225,7 +1229,7 @@ func (m *module) stopProcess() error {
// of metrics will be reported. Therefore it is safe to continue monitoring the module process
// while it's in shutdown.
if m.ftdc != nil {
m.ftdc.Remove(m.process.ID())
m.ftdc.Remove(m.getFTDCName())
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The happy path for a stop -> restart (presumably a module upgrade falls into this category).

}
}()

Expand Down Expand Up @@ -1347,6 +1351,10 @@ func (m *module) getFullEnvironment(viamHomeDir string) map[string]string {
return getFullEnvironment(m.cfg, m.dataDir, viamHomeDir)
}

func (m *module) getFTDCName() string {
return fmt.Sprintf("proc.modules.%s", m.process.ID())
}

func (m *module) registerProcessWithFTDC() {
if m.ftdc == nil {
return
Expand All @@ -1364,7 +1372,7 @@ func (m *module) registerProcessWithFTDC() {
return
}

m.ftdc.Add(fmt.Sprintf("proc.modules.%s", m.process.ID()), statser)
m.ftdc.Add(m.getFTDCName(), statser)
}

func getFullEnvironment(
Expand Down
86 changes: 86 additions & 0 deletions module/modmanager/manager_test.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package modmanager

import (
"bytes"
"context"
"encoding/json"
"errors"
Expand Down Expand Up @@ -28,6 +29,7 @@ import (
"go.viam.com/rdk/components/generic"
"go.viam.com/rdk/components/motor"
"go.viam.com/rdk/config"
"go.viam.com/rdk/ftdc"
"go.viam.com/rdk/logging"
modlib "go.viam.com/rdk/module"
modmanageroptions "go.viam.com/rdk/module/modmanager/options"
Expand Down Expand Up @@ -1379,6 +1381,90 @@ func TestBadModuleFailsFast(t *testing.T) {
test.That(t, err.Error(), test.ShouldContainSubstring, "module test-module exited too quickly after attempted startup")
}

// TestFTDCAfterModuleCrash is to give confidence that the FTDC sections devoted to tracking module
// process information (e.g: CPU usage) is in sync with the Process IDs (PIDs) that are actually
// running.
func TestFTDCAfterModuleCrash(t *testing.T) {
logger := logging.NewTestLogger(t)
modCfgs := []config.Module{
{
Name: "test-module",
// testmodule2 has a `kill_module` DoCommand to force a module/process crash.
ExePath: rtestutils.BuildTempModule(t, "module/testmodule2"),
Type: config.ModuleTypeLocal,
},
}

ctx := context.Background()
parentAddr := setupSocketWithRobot(t)
opts := modmanageroptions.Options{UntrustedEnv: false}

// Start up a mod manager with FTDC enabled. We will inspect the FTDC output for the
// `ElapsedTimeSecs` to assert the "pid tracking code" is working correctly.
ftdcData := bytes.NewBuffer(nil)
opts.FTDC = ftdc.NewWithWriter(ftdcData, logger)
// Normally a test would explicitly call `constructDatum` to control/guarantee FTDC gets
// data. But as a short-cut to avoid exposing methods that are currently private, we just run
// FTDC in the background. And sleep long enough between testing events (killing the module) to
// assert the right behavior.
opts.FTDC.Start()

// Set up a mod manager. Currently there are zero modules running.
mgr := setupModManager(t, ctx, parentAddr, logger, opts)

// Add a module, this will register an FTDC "section" for that module process.
err := mgr.Add(ctx, modCfgs...)
test.That(t, err, test.ShouldBeNil)

// Add a resource -- this is simply to invoke the `kill_module` command.
res, err := mgr.AddResource(ctx, resource.Config{
Name: "foo",
API: generic.API,
Model: resource.NewModel("rdk", "test", "helper2"),
}, nil)
test.That(t, err, test.ShouldBeNil)
test.That(t, mgr.IsModularResource(generic.Named("foo")), test.ShouldBeTrue)

// Kill the module a few times for good measure.
for idx := 0; idx < 3; idx++ {
_, err = res.DoCommand(ctx, map[string]interface{}{"command": "kill_module"})

// FTDC is running in the background with a one second interval. So we sleep for two seconds
// and cross our fingers we don't get a poor scheduler execution. The assertions are
// intentionally weak to minimize the risk of false positives (a test failure with correct
// production code).
time.Sleep(2 * time.Second)
}

mgr.Close(ctx)
opts.FTDC.StopAndJoin(ctx)

datums, err := ftdc.Parse(ftdcData)
logger.Info("Num ftdc datums: ", len(datums))

// Keep count of the number of `ElapsedTimeSecs` readings we encounter. It is a testing bug if
// we don't see any process FTDC metrics for the module.
numModuleElapsedTimeMetricsSeen := 0
for _, datum := range datums {
for _, reading := range datum.Readings {
if reading.MetricName == "proc.modules.test-module.ElapsedTimeSecs" {
logger.Infow("Reading", "timestamp", datum.Time, "elapsedTimeSecs", reading.Value)
numModuleElapsedTimeMetricsSeen++
// Dan: I don't have a good reason to believe that we can't (legitimately) observe
// an `ElapsedTimeSecs` of 0 here. It's more likely we'd see a 0 because we queried
// a bad PID.
//
// If my assumption is wrong and we get a false positive here, we can reevaluate the
// options for making a more robust test.
test.That(t, reading.Value, test.ShouldBeGreaterThan, 0)
}
}
}

// Assert that we saw at least one datapoint before considering the test a success.
test.That(t, numModuleElapsedTimeMetricsSeen, test.ShouldBeGreaterThan, 0)
}

func TestModularDiscoverFunc(t *testing.T) {
ctx := context.Background()
logger := logging.NewTestLogger(t)
Expand Down
Loading