health, ipn/ipnlocal: track, log overall health

Updates #1505

Signed-off-by: Brad Fitzpatrick <bradfitz@tailscale.com>
pull/1397/head
Brad Fitzpatrick 2021-03-15 22:20:48 -07:00
parent 6fbc9b3a98
commit 3c984ad7d5
3 changed files with 98 additions and 25 deletions

View File

@ -179,7 +179,10 @@ func NewNoStart(opts Options) (*Client, error) {
}
func (c *Client) onHealthChange(key string, err error) {
// onHealthChange handles a change in the error state of one health key.
//
// Changes reported under health.KeyOverall are ignored. For every other key
// the new state is logged and cancelMapSafely is called.
//
// NOTE(review): presumably the overall key is skipped because it is derived
// from the other states and would otherwise trigger redundant restarts —
// confirm against the health package.
func (c *Client) onHealthChange(key health.ErrorKey, err error) {
	if key != health.KeyOverall {
		c.logf("controlclient: restarting map request for %q health change to new state: %v", key, err)
		c.cancelMapSafely()
	}
}

View File

@ -7,6 +7,8 @@
package health
import (
"errors"
"fmt"
"sync"
"time"
@ -17,8 +19,9 @@ var (
// mu guards everything in this var block.
mu sync.Mutex
m = map[string]error{} // error key => err (or nil for no error)
watchers = map[*watchHandle]func(string, error){} // opt func to run if error state changes
m = map[ErrorKey]error{} // error key => err (or nil for no error)
watchers = map[*watchHandle]func(ErrorKey, error){} // opt func to run if error state changes
timer *time.Timer
inMapPoll bool
inMapPollSince time.Time
@ -32,21 +35,35 @@ var (
ipnWantRunning bool
)
// ErrorKey names a category for which a health error is being reported.
type ErrorKey string

// KeyOverall is the ErrorKey under which the self check records the combined
// result returned by overallErrorLocked.
const KeyOverall ErrorKey = "overall"

// watchHandle is the pointee type used for keys of the watchers map:
// RegisterWatcher allocates one per registration and uses its address as the
// key.
//
// NOTE(review): presumably a byte rather than an empty struct so that every
// allocation has a distinct address; Go does not guarantee distinct addresses
// for zero-size values.
type watchHandle byte
// RegisterWatcher adds a function that will be called if an
// error changes state either to unhealthy or from unhealthy. It is
// not called on transition from unknown to healthy. It must be non-nil
// and is run in its own goroutine. The returned func unregisters it.
func RegisterWatcher(cb func(errKey string, err error)) (unregister func()) {
// RegisterWatcher records cb in the set of health watchers and returns a
// function that removes it again.
//
// Registering a watcher while no self-check timer exists starts one that
// fires timerSelfCheck after one minute. The returned unregister func drops
// cb and, once no watchers remain, stops and clears that timer. Calling
// unregister more than once is harmless.
func RegisterWatcher(cb func(key ErrorKey, err error)) (unregister func()) {
	mu.Lock()
	defer mu.Unlock()

	// The handle's address is the map key, giving each registration a unique
	// identity even when the same func value is registered twice.
	h := new(watchHandle)
	watchers[h] = cb
	if timer == nil {
		timer = time.AfterFunc(time.Minute, timerSelfCheck)
	}

	unregister = func() {
		mu.Lock()
		defer mu.Unlock()
		delete(watchers, h)
		if timer == nil || len(watchers) > 0 {
			return
		}
		// Last watcher is gone: nobody is listening, so stop self-checking.
		timer.Stop()
		timer = nil
	}
	return unregister
}
@ -62,15 +79,19 @@ func SetNetworkCategoryHealth(err error) { set("network-category", err) }
func NetworkCategoryHealth() error { return get("network-category") }
func get(key string) error {
// get returns the error currently recorded for key, or nil if none is
// recorded (including when the key has never been set). It acquires mu for
// the duration of the lookup.
func get(key ErrorKey) error {
	mu.Lock()
	defer mu.Unlock()
	err := m[key]
	return err
}
func set(key string, err error) {
// set records err as the current state for key. It acquires mu and delegates
// to setLocked, which requires the lock to be held.
func set(key ErrorKey, err error) {
mu.Lock()
defer mu.Unlock()
setLocked(key, err)
}
func setLocked(key ErrorKey, err error) {
old, ok := m[key]
if !ok && err == nil {
// Initial happy path.
@ -162,14 +183,51 @@ func SetIPNState(state string, wantRunning bool) {
selfCheckLocked()
}
// timerSelfCheck is the function the self-check timer runs (RegisterWatcher
// passes it to time.AfterFunc). It takes mu, runs selfCheckLocked, and then
// re-arms the timer for another minute.
//
// The timer may have been cleared by the last watcher unregistering while
// this call was waiting for mu; in that case nothing is re-armed.
func timerSelfCheck() {
	mu.Lock()
	defer mu.Unlock()

	selfCheckLocked()
	if timer == nil {
		return
	}
	timer.Reset(time.Minute)
}
// selfCheckLocked recomputes the overall health and records it under
// KeyOverall via setLocked. Nothing is recorded while ipnState is still
// empty, i.e. before any IPN state has been reported.
//
// mu must be held by the caller.
func selfCheckLocked() {
	if ipnState != "" {
		setLocked(KeyOverall, overallErrorLocked())
	}
}
// overallErrorLocked evaluates the tracked state and returns the overall
// health: nil if every check passes, otherwise an error describing the first
// check that failed. Checks run in this order:
//
//  1. the IPN state is "Running" and running is wanted;
//  2. a map poll is in progress, or the last one ended within 10 seconds;
//  3. a streamed map response arrived within tooIdle;
//  4. a home DERP region is set;
//  5. that region is marked connected;
//  6. a frame was received from that region within tooIdle.
//
// mu must be held by the caller.
func overallErrorLocked() error {
	if ipnState != "Running" || !ipnWantRunning {
		return fmt.Errorf("state=%v, wantRunning=%v", ipnState, ipnWantRunning)
	}

	// Take one timestamp so every age below is measured from the same instant.
	now := time.Now()
	if !inMapPoll && (lastMapPollEndedAt.IsZero() || now.Sub(lastMapPollEndedAt) > 10*time.Second) {
		return errors.New("not in map poll")
	}

	const tooIdle = 2*time.Minute + 5*time.Second
	if d := now.Sub(lastStreamedMapResponse).Round(time.Second); d > tooIdle {
		return fmt.Errorf("no map response in %v", d)
	}

	rid := derpHomeRegion
	if rid == 0 {
		return errors.New("no DERP home")
	}
	if !derpRegionConnected[rid] {
		return fmt.Errorf("not connected to home DERP region %v", rid)
	}
	if d := now.Sub(derpRegionLastFrame[rid]).Round(time.Second); d > tooIdle {
		return fmt.Errorf("haven't heard from home DERP region %v in %v", rid, d)
	}

	// TODO: use these in the checks above. They are referenced here only so
	// staticcheck does not report them as unused.
	_ = inMapPollSince
	_ = lastMapRequestHeard

	return nil
}

View File

@ -65,20 +65,21 @@ func getControlDebugFlags() []string {
// state machine generates events back out to zero or more components.
type LocalBackend struct {
// Elements that are thread-safe or constant after construction.
ctx context.Context // canceled by Close
ctxCancel context.CancelFunc // cancels ctx
logf logger.Logf // general logging
keyLogf logger.Logf // for printing list of peers on change
statsLogf logger.Logf // for printing peers stats on change
e wgengine.Engine
store ipn.StateStore
backendLogID string
unregisterLinkMon func()
portpoll *portlist.Poller // may be nil
portpollOnce sync.Once // guards starting readPoller
gotPortPollRes chan struct{} // closed upon first readPoller result
serverURL string // tailcontrol URL
newDecompressor func() (controlclient.Decompressor, error)
ctx context.Context // canceled by Close
ctxCancel context.CancelFunc // cancels ctx
logf logger.Logf // general logging
keyLogf logger.Logf // for printing list of peers on change
statsLogf logger.Logf // for printing peers stats on change
e wgengine.Engine
store ipn.StateStore
backendLogID string
unregisterLinkMon func()
unregisterHealthWatch func()
portpoll *portlist.Poller // may be nil
portpollOnce sync.Once // guards starting readPoller
gotPortPollRes chan struct{} // closed upon first readPoller result
serverURL string // tailcontrol URL
newDecompressor func() (controlclient.Decompressor, error)
filterHash string
@ -148,6 +149,8 @@ func NewLocalBackend(logf logger.Logf, logid string, store ipn.StateStore, e wge
b.linkChange(false, linkMon.InterfaceState())
b.unregisterLinkMon = linkMon.RegisterChangeCallback(b.linkChange)
b.unregisterHealthWatch = health.RegisterWatcher(b.onHealthChange)
return b, nil
}
@ -182,6 +185,14 @@ func (b *LocalBackend) linkChange(major bool, ifst *interfaces.State) {
b.updateFilter(b.netMap, b.prefs)
}
// onHealthChange is the backend's health watcher callback (registered with
// health.RegisterWatcher in NewLocalBackend). It only logs: "ok" when err is
// nil, otherwise the error, each tagged with the quoted key.
func (b *LocalBackend) onHealthChange(errKey health.ErrorKey, err error) {
	if err != nil {
		b.logf("health(%q): error: %v", errKey, err)
		return
	}
	b.logf("health(%q): ok", errKey)
}
// Shutdown halts the backend and all its sub-components. The backend
// can no longer be used after Shutdown returns.
func (b *LocalBackend) Shutdown() {
@ -190,6 +201,7 @@ func (b *LocalBackend) Shutdown() {
b.mu.Unlock()
b.unregisterLinkMon()
b.unregisterHealthWatch()
if cli != nil {
cli.Shutdown()
}