Fix slow shutdown (#2113)

* rearrange shutdown

Signed-off-by: Kristoffer Dalby <kristoffer@tailscale.com>

* http closed is fine

Signed-off-by: Kristoffer Dalby <kristoffer@tailscale.com>

* update changelog

Signed-off-by: Kristoffer Dalby <kristoffer@tailscale.com>

* logging while shutting

Signed-off-by: Kristoffer Dalby <kristoffer@tailscale.com>

---------

Signed-off-by: Kristoffer Dalby <kristoffer@tailscale.com>
This commit is contained in:
Kristoffer Dalby 2024-09-09 14:10:22 +02:00 committed by GitHub
parent bac7ea67f4
commit 60b94b0467
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
4 changed files with 57 additions and 21 deletions

View file

@ -74,6 +74,7 @@ after improving the test harness as part of adopting [#1460](https://github.com/
- Make sure integration tests cover postgres for all scenarios - Make sure integration tests cover postgres for all scenarios
- CLI commands (all except `serve`) only requires minimal configuration, no more errors or warnings from unset settings [#2109](https://github.com/juanfont/headscale/pull/2109) - CLI commands (all except `serve`) only requires minimal configuration, no more errors or warnings from unset settings [#2109](https://github.com/juanfont/headscale/pull/2109)
- CLI results are now concistently sent to stdout and errors to stderr [#2109](https://github.com/juanfont/headscale/pull/2109) - CLI results are now concistently sent to stdout and errors to stderr [#2109](https://github.com/juanfont/headscale/pull/2109)
- Fix issue where shutting down headscale would hang [#2113](https://github.com/juanfont/headscale/pull/2113)
## 0.22.3 (2023-05-12) ## 0.22.3 (2023-05-12)

View file

@ -1,6 +1,9 @@
package cli package cli
import ( import (
"errors"
"net/http"
"github.com/rs/zerolog/log" "github.com/rs/zerolog/log"
"github.com/spf13/cobra" "github.com/spf13/cobra"
) )
@ -22,8 +25,8 @@ var serveCmd = &cobra.Command{
} }
err = app.Serve() err = app.Serve()
if err != nil { if err != nil && !errors.Is(err, http.ErrServerClosed) {
log.Fatal().Caller().Err(err).Msg("Error starting server") log.Fatal().Caller().Err(err).Msg("Headscale ran into an error and had to shut down.")
} }
}, },
} }

View file

@ -770,7 +770,7 @@ func (h *Headscale) Serve() error {
}) })
} }
default: default:
trace := log.Trace().Msgf info := func(msg string) { log.Info().Msg(msg) }
log.Info(). log.Info().
Str("signal", sig.String()). Str("signal", sig.String()).
Msg("Received signal to stop, shutting down gracefully") Msg("Received signal to stop, shutting down gracefully")
@ -778,55 +778,55 @@ func (h *Headscale) Serve() error {
expireNodeCancel() expireNodeCancel()
h.ephemeralGC.Close() h.ephemeralGC.Close()
trace("waiting for netmap stream to close")
h.pollNetMapStreamWG.Wait()
// Gracefully shut down servers // Gracefully shut down servers
ctx, cancel := context.WithTimeout( ctx, cancel := context.WithTimeout(
context.Background(), context.Background(),
types.HTTPShutdownTimeout, types.HTTPShutdownTimeout,
) )
trace("shutting down debug http server") info("shutting down debug http server")
if err := debugHTTPServer.Shutdown(ctx); err != nil { if err := debugHTTPServer.Shutdown(ctx); err != nil {
log.Error().Err(err).Msg("Failed to shutdown prometheus http") log.Error().Err(err).Msg("failed to shutdown prometheus http")
} }
trace("shutting down main http server") info("shutting down main http server")
if err := httpServer.Shutdown(ctx); err != nil { if err := httpServer.Shutdown(ctx); err != nil {
log.Error().Err(err).Msg("Failed to shutdown http") log.Error().Err(err).Msg("failed to shutdown http")
} }
trace("shutting down grpc server (socket)") info("closing node notifier")
h.nodeNotifier.Close()
info("waiting for netmap stream to close")
h.pollNetMapStreamWG.Wait()
info("shutting down grpc server (socket)")
grpcSocket.GracefulStop() grpcSocket.GracefulStop()
if grpcServer != nil { if grpcServer != nil {
trace("shutting down grpc server (external)") info("shutting down grpc server (external)")
grpcServer.GracefulStop() grpcServer.GracefulStop()
grpcListener.Close() grpcListener.Close()
} }
if tailsqlContext != nil { if tailsqlContext != nil {
trace("shutting down tailsql") info("shutting down tailsql")
tailsqlContext.Done() tailsqlContext.Done()
} }
trace("closing node notifier")
h.nodeNotifier.Close()
// Close network listeners // Close network listeners
trace("closing network listeners") info("closing network listeners")
debugHTTPListener.Close() debugHTTPListener.Close()
httpListener.Close() httpListener.Close()
grpcGatewayConn.Close() grpcGatewayConn.Close()
// Stop listening (and unlink the socket if unix type): // Stop listening (and unlink the socket if unix type):
trace("closing socket listener") info("closing socket listener")
socketListener.Close() socketListener.Close()
// Close db connections // Close db connections
trace("closing database connection") info("closing database connection")
err = h.db.Close() err = h.db.Close()
if err != nil { if err != nil {
log.Error().Err(err).Msg("Failed to close db") log.Error().Err(err).Msg("failed to close db")
} }
log.Info(). log.Info().

View file

@ -36,6 +36,7 @@ type Notifier struct {
connected *xsync.MapOf[types.NodeID, bool] connected *xsync.MapOf[types.NodeID, bool]
b *batcher b *batcher
cfg *types.Config cfg *types.Config
closed bool
} }
func NewNotifier(cfg *types.Config) *Notifier { func NewNotifier(cfg *types.Config) *Notifier {
@ -43,6 +44,7 @@ func NewNotifier(cfg *types.Config) *Notifier {
nodes: make(map[types.NodeID]chan<- types.StateUpdate), nodes: make(map[types.NodeID]chan<- types.StateUpdate),
connected: xsync.NewMapOf[types.NodeID, bool](), connected: xsync.NewMapOf[types.NodeID, bool](),
cfg: cfg, cfg: cfg,
closed: false,
} }
b := newBatcher(cfg.Tuning.BatchChangeDelay, n) b := newBatcher(cfg.Tuning.BatchChangeDelay, n)
n.b = b n.b = b
@ -51,9 +53,19 @@ func NewNotifier(cfg *types.Config) *Notifier {
return n return n
} }
// Close stops the batcher inside the notifier. // Close stops the batcher and closes all channels.
func (n *Notifier) Close() { func (n *Notifier) Close() {
notifierWaitersForLock.WithLabelValues("lock", "close").Inc()
n.l.Lock()
defer n.l.Unlock()
notifierWaitersForLock.WithLabelValues("lock", "close").Dec()
n.closed = true
n.b.close() n.b.close()
for _, c := range n.nodes {
close(c)
}
} }
func (n *Notifier) tracef(nID types.NodeID, msg string, args ...any) { func (n *Notifier) tracef(nID types.NodeID, msg string, args ...any) {
@ -70,6 +82,10 @@ func (n *Notifier) AddNode(nodeID types.NodeID, c chan<- types.StateUpdate) {
notifierWaitersForLock.WithLabelValues("lock", "add").Dec() notifierWaitersForLock.WithLabelValues("lock", "add").Dec()
notifierWaitForLock.WithLabelValues("add").Observe(time.Since(start).Seconds()) notifierWaitForLock.WithLabelValues("add").Observe(time.Since(start).Seconds())
if n.closed {
return
}
// If a channel exists, it means the node has opened a new // If a channel exists, it means the node has opened a new
// connection. Close the old channel and replace it. // connection. Close the old channel and replace it.
if curr, ok := n.nodes[nodeID]; ok { if curr, ok := n.nodes[nodeID]; ok {
@ -96,6 +112,10 @@ func (n *Notifier) RemoveNode(nodeID types.NodeID, c chan<- types.StateUpdate) b
notifierWaitersForLock.WithLabelValues("lock", "remove").Dec() notifierWaitersForLock.WithLabelValues("lock", "remove").Dec()
notifierWaitForLock.WithLabelValues("remove").Observe(time.Since(start).Seconds()) notifierWaitForLock.WithLabelValues("remove").Observe(time.Since(start).Seconds())
if n.closed {
return true
}
if len(n.nodes) == 0 { if len(n.nodes) == 0 {
return true return true
} }
@ -154,6 +174,10 @@ func (n *Notifier) NotifyWithIgnore(
update types.StateUpdate, update types.StateUpdate,
ignoreNodeIDs ...types.NodeID, ignoreNodeIDs ...types.NodeID,
) { ) {
if n.closed {
return
}
notifierUpdateReceived.WithLabelValues(update.Type.String(), types.NotifyOriginKey.Value(ctx)).Inc() notifierUpdateReceived.WithLabelValues(update.Type.String(), types.NotifyOriginKey.Value(ctx)).Inc()
n.b.addOrPassthrough(update) n.b.addOrPassthrough(update)
} }
@ -170,6 +194,10 @@ func (n *Notifier) NotifyByNodeID(
notifierWaitersForLock.WithLabelValues("lock", "notify").Dec() notifierWaitersForLock.WithLabelValues("lock", "notify").Dec()
notifierWaitForLock.WithLabelValues("notify").Observe(time.Since(start).Seconds()) notifierWaitForLock.WithLabelValues("notify").Observe(time.Since(start).Seconds())
if n.closed {
return
}
if c, ok := n.nodes[nodeID]; ok { if c, ok := n.nodes[nodeID]; ok {
select { select {
case <-ctx.Done(): case <-ctx.Done():
@ -205,6 +233,10 @@ func (n *Notifier) sendAll(update types.StateUpdate) {
notifierWaitersForLock.WithLabelValues("lock", "send-all").Dec() notifierWaitersForLock.WithLabelValues("lock", "send-all").Dec()
notifierWaitForLock.WithLabelValues("send-all").Observe(time.Since(start).Seconds()) notifierWaitForLock.WithLabelValues("send-all").Observe(time.Since(start).Seconds())
if n.closed {
return
}
for id, c := range n.nodes { for id, c := range n.nodes {
// Whenever an update is sent to all nodes, there is a chance that the node // Whenever an update is sent to all nodes, there is a chance that the node
// has disconnected and the goroutine that was supposed to consume the update // has disconnected and the goroutine that was supposed to consume the update