health, ipn/ipnlocal: track, log overall health
Updates #1505
Signed-off-by: Brad Fitzpatrick <bradfitz@tailscale.com>
pull/1397/head
parent
6fbc9b3a98
commit
3c984ad7d5
|
@ -179,7 +179,10 @@ func NewNoStart(opts Options) (*Client, error) {
|
|||
|
||||
}
|
||||
|
||||
func (c *Client) onHealthChange(key string, err error) {
|
||||
func (c *Client) onHealthChange(key health.ErrorKey, err error) {
|
||||
if key == health.KeyOverall {
|
||||
return
|
||||
}
|
||||
c.logf("controlclient: restarting map request for %q health change to new state: %v", key, err)
|
||||
c.cancelMapSafely()
|
||||
}
|
||||
|
|
|
@ -7,6 +7,8 @@
|
|||
package health
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"fmt"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
|
@ -17,8 +19,9 @@ var (
|
|||
// mu guards everything in this var block.
|
||||
mu sync.Mutex
|
||||
|
||||
m = map[string]error{} // error key => err (or nil for no error)
|
||||
watchers = map[*watchHandle]func(string, error){} // opt func to run if error state changes
|
||||
m = map[ErrorKey]error{} // error key => err (or nil for no error)
|
||||
watchers = map[*watchHandle]func(ErrorKey, error){} // opt func to run if error state changes
|
||||
timer *time.Timer
|
||||
|
||||
inMapPoll bool
|
||||
inMapPollSince time.Time
|
||||
|
@ -32,21 +35,35 @@ var (
|
|||
ipnWantRunning bool
|
||||
)
|
||||
|
||||
// ErrorKey names a category of health state for which an error (or the
// absence of one) is reported.
type ErrorKey string

// KeyOverall is the ErrorKey under which the combined, overall health
// verdict is recorded.
const KeyOverall ErrorKey = "overall"
|
||||
|
||||
// watchHandle is an opaque token whose address identifies one registered
// watcher; RegisterWatcher allocates one with new and uses the pointer as
// the key into the watchers map. It is deliberately a non-zero-size type:
// Go does not guarantee that pointers to distinct zero-size values differ.
type watchHandle byte
|
||||
|
||||
// RegisterWatcher adds a function that will be called if an
|
||||
// error changes state either to unhealthy or from unhealthy. It is
|
||||
// not called on transition from unknown to healthy. It must be non-nil
|
||||
// and is run in its own goroutine. The returned func unregisters it.
|
||||
func RegisterWatcher(cb func(errKey string, err error)) (unregister func()) {
|
||||
func RegisterWatcher(cb func(key ErrorKey, err error)) (unregister func()) {
|
||||
mu.Lock()
|
||||
defer mu.Unlock()
|
||||
handle := new(watchHandle)
|
||||
watchers[handle] = cb
|
||||
if timer == nil {
|
||||
timer = time.AfterFunc(time.Minute, timerSelfCheck)
|
||||
}
|
||||
return func() {
|
||||
mu.Lock()
|
||||
defer mu.Unlock()
|
||||
delete(watchers, handle)
|
||||
if len(watchers) == 0 && timer != nil {
|
||||
timer.Stop()
|
||||
timer = nil
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -62,15 +79,19 @@ func SetNetworkCategoryHealth(err error) { set("network-category", err) }
|
|||
|
||||
func NetworkCategoryHealth() error { return get("network-category") }
|
||||
|
||||
func get(key string) error {
|
||||
func get(key ErrorKey) error {
|
||||
mu.Lock()
|
||||
defer mu.Unlock()
|
||||
return m[key]
|
||||
}
|
||||
|
||||
func set(key string, err error) {
|
||||
func set(key ErrorKey, err error) {
|
||||
mu.Lock()
|
||||
defer mu.Unlock()
|
||||
setLocked(key, err)
|
||||
}
|
||||
|
||||
func setLocked(key ErrorKey, err error) {
|
||||
old, ok := m[key]
|
||||
if !ok && err == nil {
|
||||
// Initial happy path.
|
||||
|
@ -162,14 +183,51 @@ func SetIPNState(state string, wantRunning bool) {
|
|||
selfCheckLocked()
|
||||
}
|
||||
|
||||
func timerSelfCheck() {
|
||||
mu.Lock()
|
||||
defer mu.Unlock()
|
||||
selfCheckLocked()
|
||||
if timer != nil {
|
||||
timer.Reset(time.Minute)
|
||||
}
|
||||
}
|
||||
|
||||
func selfCheckLocked() {
|
||||
// TODO: check states against each other.
|
||||
// For staticcheck for now:
|
||||
if ipnState == "" {
|
||||
// Don't check yet.
|
||||
return
|
||||
}
|
||||
setLocked(KeyOverall, overallErrorLocked())
|
||||
}
|
||||
|
||||
func overallErrorLocked() error {
|
||||
if ipnState != "Running" || !ipnWantRunning {
|
||||
return fmt.Errorf("state=%v, wantRunning=%v", ipnState, ipnWantRunning)
|
||||
}
|
||||
now := time.Now()
|
||||
if !inMapPoll && (lastMapPollEndedAt.IsZero() || now.Sub(lastMapPollEndedAt) > 10*time.Second) {
|
||||
return errors.New("not in map poll")
|
||||
}
|
||||
const tooIdle = 2*time.Minute + 5*time.Second
|
||||
if d := now.Sub(lastStreamedMapResponse).Round(time.Second); d > tooIdle {
|
||||
return fmt.Errorf("no map response in %v", d)
|
||||
}
|
||||
rid := derpHomeRegion
|
||||
if rid == 0 {
|
||||
return errors.New("no DERP home")
|
||||
}
|
||||
if !derpRegionConnected[rid] {
|
||||
return fmt.Errorf("not connected to home DERP region %v", rid)
|
||||
}
|
||||
if d := now.Sub(derpRegionLastFrame[rid]).Round(time.Second); d > tooIdle {
|
||||
return fmt.Errorf("haven't heard from home DERP region %v in %v", rid, d)
|
||||
}
|
||||
|
||||
// TODO: use
|
||||
_ = inMapPollSince
|
||||
_ = lastMapPollEndedAt
|
||||
_ = lastStreamedMapResponse
|
||||
_ = derpHomeRegion
|
||||
_ = lastMapRequestHeard
|
||||
_ = ipnState
|
||||
_ = ipnWantRunning
|
||||
|
||||
return nil
|
||||
}
|
||||
|
|
|
@ -65,20 +65,21 @@ func getControlDebugFlags() []string {
|
|||
// state machine generates events back out to zero or more components.
|
||||
type LocalBackend struct {
|
||||
// Elements that are thread-safe or constant after construction.
|
||||
ctx context.Context // canceled by Close
|
||||
ctxCancel context.CancelFunc // cancels ctx
|
||||
logf logger.Logf // general logging
|
||||
keyLogf logger.Logf // for printing list of peers on change
|
||||
statsLogf logger.Logf // for printing peers stats on change
|
||||
e wgengine.Engine
|
||||
store ipn.StateStore
|
||||
backendLogID string
|
||||
unregisterLinkMon func()
|
||||
portpoll *portlist.Poller // may be nil
|
||||
portpollOnce sync.Once // guards starting readPoller
|
||||
gotPortPollRes chan struct{} // closed upon first readPoller result
|
||||
serverURL string // tailcontrol URL
|
||||
newDecompressor func() (controlclient.Decompressor, error)
|
||||
ctx context.Context // canceled by Close
|
||||
ctxCancel context.CancelFunc // cancels ctx
|
||||
logf logger.Logf // general logging
|
||||
keyLogf logger.Logf // for printing list of peers on change
|
||||
statsLogf logger.Logf // for printing peers stats on change
|
||||
e wgengine.Engine
|
||||
store ipn.StateStore
|
||||
backendLogID string
|
||||
unregisterLinkMon func()
|
||||
unregisterHealthWatch func()
|
||||
portpoll *portlist.Poller // may be nil
|
||||
portpollOnce sync.Once // guards starting readPoller
|
||||
gotPortPollRes chan struct{} // closed upon first readPoller result
|
||||
serverURL string // tailcontrol URL
|
||||
newDecompressor func() (controlclient.Decompressor, error)
|
||||
|
||||
filterHash string
|
||||
|
||||
|
@ -148,6 +149,8 @@ func NewLocalBackend(logf logger.Logf, logid string, store ipn.StateStore, e wge
|
|||
b.linkChange(false, linkMon.InterfaceState())
|
||||
b.unregisterLinkMon = linkMon.RegisterChangeCallback(b.linkChange)
|
||||
|
||||
b.unregisterHealthWatch = health.RegisterWatcher(b.onHealthChange)
|
||||
|
||||
return b, nil
|
||||
}
|
||||
|
||||
|
@ -182,6 +185,14 @@ func (b *LocalBackend) linkChange(major bool, ifst *interfaces.State) {
|
|||
b.updateFilter(b.netMap, b.prefs)
|
||||
}
|
||||
|
||||
func (b *LocalBackend) onHealthChange(errKey health.ErrorKey, err error) {
|
||||
if err == nil {
|
||||
b.logf("health(%q): ok", errKey)
|
||||
} else {
|
||||
b.logf("health(%q): error: %v", errKey, err)
|
||||
}
|
||||
}
|
||||
|
||||
// Shutdown halts the backend and all its sub-components. The backend
|
||||
// can no longer be used after Shutdown returns.
|
||||
func (b *LocalBackend) Shutdown() {
|
||||
|
@ -190,6 +201,7 @@ func (b *LocalBackend) Shutdown() {
|
|||
b.mu.Unlock()
|
||||
|
||||
b.unregisterLinkMon()
|
||||
b.unregisterHealthWatch()
|
||||
if cli != nil {
|
||||
cli.Shutdown()
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue