// headscale/hscontrol/poll.go

package hscontrol
import (
"context"
"fmt"
"net/http"
"strings"
"time"
"github.com/juanfont/headscale/hscontrol/db"
"github.com/juanfont/headscale/hscontrol/mapper"
"github.com/juanfont/headscale/hscontrol/types"
"github.com/rs/zerolog/log"
xslices "golang.org/x/exp/slices"
"gorm.io/gorm"
"tailscale.com/envknob"
"tailscale.com/tailcfg"
)
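// keepAliveInterval is how often an empty keep-alive MapResponse is
// written to an open streaming connection (see the keepAliveTicker
// loop in handlePoll).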
const (
keepAliveInterval = 60 * time.Second
)
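// contextKey is a private key type for values this package stores on a
// context; nodeNameContextKey carries the node's hostname on the
// streaming context created in handlePoll.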
type contextKey string
const nodeNameContextKey = contextKey("nodeName")
type UpdateNode func()
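// logPollFunc returns a trace, warn, and error logger, each
// pre-populated with the identifying fields of the given MapRequest
// and node so call sites can log consistently.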
func logPollFunc(
mapRequest tailcfg.MapRequest,
node *types.Node,
) (func(string), func(string), func(error, string)) {
return func(msg string) {
log.Trace().
Caller().
Bool("readOnly", mapRequest.ReadOnly).
Bool("omitPeers", mapRequest.OmitPeers).
Bool("stream", mapRequest.Stream).
Str("node_key", node.NodeKey.ShortString()).
Str("node", node.Hostname).
Msg(msg)
},
func(msg string) {
log.Warn().
Caller().
Bool("readOnly", mapRequest.ReadOnly).
Bool("omitPeers", mapRequest.OmitPeers).
Bool("stream", mapRequest.Stream).
Str("node_key", node.NodeKey.ShortString()).
Str("node", node.Hostname).
Msg(msg)
},
func(err error, msg string) {
log.Error().
Caller().
Bool("readOnly", mapRequest.ReadOnly).
Bool("omitPeers", mapRequest.OmitPeers).
Bool("stream", mapRequest.Stream).
Str("node_key", node.NodeKey.ShortString()).
Str("node", node.Hostname).
Err(err).
Msg(msg)
}
}
// handlePoll ensures the node gets the appropriate updates from either
// polling or immediate responses.
//
//nolint:gocyclo
func (h *Headscale) handlePoll(
writer http.ResponseWriter,
ctx context.Context,
node *types.Node,
mapRequest tailcfg.MapRequest,
) {
logTrace, logWarn, logErr := logPollFunc(mapRequest, node)
// This is the mechanism where the node gives us information about its
// current configuration.
//
// If OmitPeers is true, Stream is false, and ReadOnly is false,
// then the server will let clients update their endpoints without
// breaking existing long-polling (Stream == true) connections.
// In this case, the server can omit the entire response; the client
// only checks the HTTP response status code.
// TODO(kradalby): remove ReadOnly when we only support capVer 68+
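// For reference, the (OmitPeers, Stream, ReadOnly) combinations
// handled below are:
//
//	OmitPeers=true,  Stream=false, ReadOnly=false: endpoint update, empty 200 response
//	OmitPeers=true,  Stream=false, ReadOnly=true:  lite request (e.g. DERP map discovery)
//	OmitPeers=true,  Stream=true:                  invalid, ignored
//	everything else:                               full long-poll stream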
if mapRequest.OmitPeers && !mapRequest.Stream && !mapRequest.ReadOnly {
log.Info().
Caller().
Bool("readOnly", mapRequest.ReadOnly).
Bool("omitPeers", mapRequest.OmitPeers).
Bool("stream", mapRequest.Stream).
Str("node_key", node.NodeKey.ShortString()).
Str("node", node.Hostname).
Int("cap_ver", int(mapRequest.Version)).
Msg("Received update")
change := node.PeerChangeFromMapRequest(mapRequest)
online := h.nodeNotifier.IsConnected(node.MachineKey)
change.Online = &online
node.ApplyPeerChange(&change)
// Hostinfo.Equal reports whether the hostinfo is unchanged, so
// invert it to get "has changed" semantics for the logging and the
// check below.
hostInfoChanged := !node.Hostinfo.Equal(mapRequest.Hostinfo)
logTracePeerChange(node.Hostname, hostInfoChanged, &change)
// Check if the Hostinfo of the node has changed.
// If it has changed, check if there has been a change to
// the routable IPs of the host and update them in
// the database. Then send a Changed update
// (containing the whole node object) to peers to inform about
// the route change.
// If the hostinfo has changed, but not the routes, just update
// hostinfo and let the function continue.
if hostInfoChanged {
oldRoutes := node.Hostinfo.RoutableIPs
newRoutes := mapRequest.Hostinfo.RoutableIPs
oldServicesCount := len(node.Hostinfo.Services)
newServicesCount := len(mapRequest.Hostinfo.Services)
node.Hostinfo = mapRequest.Hostinfo
sendUpdate := false
// Route changes come as part of Hostinfo, which means that
// when an update comes, the Node Route logic needs to run.
// This will require a "change" in comparison to a "patch",
// which is more costly.
if !xslices.Equal(oldRoutes, newRoutes) {
var err error
sendUpdate, err = h.db.SaveNodeRoutes(node)
if err != nil {
logErr(err, "Error processing node routes")
http.Error(writer, "", http.StatusInternalServerError)
return
}
if h.ACLPolicy != nil {
// update routes with peer information
update, err := h.db.EnableAutoApprovedRoutes(h.ACLPolicy, node)
if err != nil {
logErr(err, "Error running auto approved routes")
}
if update != nil {
sendUpdate = true
}
}
}
// Services is mostly useful for discovery and not critical,
// except for peerapi, which is how nodes talk to each other.
// If peerapi was not part of the initial mapresponse, we
// need to make sure it's sent out later as it is needed for
// Taildrop.
// TODO(kradalby): Length comparison is a bit naive, replace.
if oldServicesCount != newServicesCount {
sendUpdate = true
}
if sendUpdate {
if err := h.db.DB.Save(node).Error; err != nil {
logErr(err, "Failed to persist/update node in the database")
http.Error(writer, "", http.StatusInternalServerError)
return
}
// Send an update to all peers to propagate the new routes
// available.
stateUpdate := types.StateUpdate{
Type: types.StatePeerChanged,
ChangeNodes: types.Nodes{node},
Message: "called from handlePoll -> update -> new hostinfo",
}
if stateUpdate.Valid() {
ctx := types.NotifyCtx(context.Background(), "poll-nodeupdate-peers-hostinfochange", node.Hostname)
h.nodeNotifier.NotifyWithIgnore(
ctx,
stateUpdate,
node.MachineKey.String())
}
// Send an update to the node itself to ensure it
// has an updated packetfilter allowing the new route
// if it is defined in the ACL.
selfUpdate := types.StateUpdate{
Type: types.StateSelfUpdate,
ChangeNodes: types.Nodes{node},
}
if selfUpdate.Valid() {
ctx := types.NotifyCtx(context.Background(), "poll-nodeupdate-self-hostinfochange", node.Hostname)
h.nodeNotifier.NotifyByMachineKey(
ctx,
selfUpdate,
node.MachineKey)
}
return
}
}
if err := h.db.DB.Save(node).Error; err != nil {
logErr(err, "Failed to persist/update node in the database")
http.Error(writer, "", http.StatusInternalServerError)
return
}
stateUpdate := types.StateUpdate{
Type: types.StatePeerChangedPatch,
ChangePatches: []*tailcfg.PeerChange{&change},
}
if stateUpdate.Valid() {
ctx := types.NotifyCtx(context.Background(), "poll-nodeupdate-peers-patch", node.Hostname)
h.nodeNotifier.NotifyWithIgnore(
ctx,
stateUpdate,
node.MachineKey.String())
}
writer.WriteHeader(http.StatusOK)
if f, ok := writer.(http.Flusher); ok {
f.Flush()
}
return
} else if mapRequest.OmitPeers && !mapRequest.Stream && mapRequest.ReadOnly {
// ReadOnly means the client only wants to fetch the MapResponse,
// without updating its Endpoints. The Endpoints field will be
// ignored, LastSeen will not be updated, and peers will not be
// notified of changes.
//
// The intended use is for clients to discover the DERP map at
// start-up before their first real endpoint update.
h.handleLiteRequest(writer, node, mapRequest)
return
} else if mapRequest.OmitPeers && mapRequest.Stream {
logErr(nil, "Ignoring request, don't know how to handle it")
return
}
change := node.PeerChangeFromMapRequest(mapRequest)
// A stream is being set up; the node is online.
online := true
change.Online = &online
node.ApplyPeerChange(&change)
// Only save HostInfo if changed, update routes if changed
// TODO(kradalby): Remove when capver is over 68
if !node.Hostinfo.Equal(mapRequest.Hostinfo) {
oldRoutes := node.Hostinfo.RoutableIPs
newRoutes := mapRequest.Hostinfo.RoutableIPs
node.Hostinfo = mapRequest.Hostinfo
if !xslices.Equal(oldRoutes, newRoutes) {
_, err := h.db.SaveNodeRoutes(node)
if err != nil {
logErr(err, "Error processing node routes")
http.Error(writer, "", http.StatusInternalServerError)
return
}
}
}
if err := h.db.DB.Save(node).Error; err != nil {
logErr(err, "Failed to persist/update node in the database")
http.Error(writer, "", http.StatusInternalServerError)
return
}
// Set up the client stream
h.pollNetMapStreamWG.Add(1)
defer h.pollNetMapStreamWG.Done()
// Use a buffered channel in case a node is not fully ready
// to receive a message, to make sure we don't block the entire
// notifier.
// The default of 3 is arbitrary and can be overridden with
// HEADSCALE_TUNING_POLL_QUEUE_SIZE below.
chanSize := 3
if size, ok := envknob.LookupInt("HEADSCALE_TUNING_POLL_QUEUE_SIZE"); ok {
chanSize = size
}
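// For example, an operator who sees updates being dropped under load
// could raise the per-node queue size at startup (value illustrative):
//
//	HEADSCALE_TUNING_POLL_QUEUE_SIZE=32 headscale serve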
updateChan := make(chan types.StateUpdate, chanSize)
defer closeChanWithLog(updateChan, node.Hostname, "updateChan")
// Register the node's update channel
h.nodeNotifier.AddNode(node.MachineKey, updateChan)
defer h.nodeNotifier.RemoveNode(node.MachineKey)
// When a node connects to control, list the peers it has at
// that given point; further updates are kept in memory in
// the Mapper, which lives for the duration of the polling
// session.
peers, err := h.db.ListPeers(node)
if err != nil {
logErr(err, "Failed to list peers when opening poller")
http.Error(writer, "", http.StatusInternalServerError)
return
}
isConnected := h.nodeNotifier.ConnectedMap()
for _, peer := range peers {
online := isConnected[peer.MachineKey]
peer.IsOnline = &online
}
mapp := mapper.NewMapper(
node,
peers,
h.DERPMap,
h.cfg.BaseDomain,
h.cfg.DNSConfig,
h.cfg.LogTail.Enabled,
h.cfg.RandomizeClientPort,
)
// update ACLRules with peer information (to update server tags if necessary)
if h.ACLPolicy != nil {
// update routes with peer information
// This state update is ignored as it will be sent
// as part of the whole node
// TODO(kradalby): figure out if that is actually correct
_, err = h.db.EnableAutoApprovedRoutes(h.ACLPolicy, node)
if err != nil {
logErr(err, "Error running auto approved routes")
}
}
logTrace("Sending initial map")
mapResp, err := mapp.FullMapResponse(mapRequest, node, h.ACLPolicy)
if err != nil {
logErr(err, "Failed to create MapResponse")
http.Error(writer, "", http.StatusInternalServerError)
return
}
// Write the initial mapresponse to the client.
_, err = writer.Write(mapResp)
if err != nil {
logErr(err, "Could not write the map response")
return
}
if flusher, ok := writer.(http.Flusher); ok {
flusher.Flush()
} else {
return
}
stateUpdate := types.StateUpdate{
Type: types.StatePeerChanged,
ChangeNodes: types.Nodes{node},
Message: "called from handlePoll -> new node added",
}
if stateUpdate.Valid() {
ctx := types.NotifyCtx(context.Background(), "poll-newnode-peers", node.Hostname)
h.nodeNotifier.NotifyWithIgnore(
ctx,
stateUpdate,
node.MachineKey.String())
}
if len(node.Routes) > 0 {
go h.pollFailoverRoutes(logErr, "new node", node)
}
keepAliveTicker := time.NewTicker(keepAliveInterval)
ctx, cancel := context.WithCancel(context.WithValue(ctx, nodeNameContextKey, node.Hostname))
defer cancel()
for {
logTrace("Waiting for update on stream channel")
select {
case <-keepAliveTicker.C:
data, err := mapp.KeepAliveResponse(mapRequest, node)
if err != nil {
logErr(err, "Error generating the keep alive msg")
return
}
_, err = writer.Write(data)
if err != nil {
logErr(err, "Cannot write keep alive message")
return
}
if flusher, ok := writer.(http.Flusher); ok {
flusher.Flush()
} else {
log.Error().Msg("Connection does not support http.Flusher, cannot stream")
return
}
// This goroutine is not ideal, but we have a potential issue here
// where it blocks too long and that holds up updates.
// One alternative is to split these different channels into
// goroutines, but then you might have a problem without a lock
// if a keepalive is written at the same time as an update.
go h.updateNodeOnlineStatus(true, node)
case update := <-updateChan:
logTrace("Received update")
now := time.Now()
var data []byte
var err error
// Ensure the node object is updated, for example, there
// might have been a hostinfo update in a sidechannel
// which contains data needed to generate a map response.
node, err = h.db.GetNodeByMachineKey(node.MachineKey)
if err != nil {
logErr(err, "Could not get machine from db")
return
}
startMapResp := time.Now()
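// Each update type is rendered by a dedicated mapper method: full
// map, changed peers, peer patch, peer removal, self update, or
// DERP map.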
switch update.Type {
case types.StateFullUpdate:
logTrace("Sending Full MapResponse")
data, err = mapp.FullMapResponse(mapRequest, node, h.ACLPolicy)
case types.StatePeerChanged:
logTrace(fmt.Sprintf("Sending Changed MapResponse: %s", update.Message))
isConnectedMap := h.nodeNotifier.ConnectedMap()
// Avoid shadowing the outer node variable while fixing up the
// online status of the changed peers.
for _, changedNode := range update.ChangeNodes {
// If a node is not reported to be online, it might be
// because the value is outdated; check with the notifier.
// However, if it is set to online but missing from the
// notifier, it may have announced itself without yet
// reaching the stage that creates the notifier channel.
if changedNode.IsOnline != nil && !*changedNode.IsOnline {
isOnline := isConnectedMap[changedNode.MachineKey]
changedNode.IsOnline = &isOnline
}
}
data, err = mapp.PeerChangedResponse(mapRequest, node, update.ChangeNodes, h.ACLPolicy, update.Message)
case types.StatePeerChangedPatch:
logTrace("Sending PeerChangedPatch MapResponse")
data, err = mapp.PeerChangedPatchResponse(mapRequest, node, update.ChangePatches, h.ACLPolicy)
case types.StatePeerRemoved:
logTrace("Sending PeerRemoved MapResponse")
data, err = mapp.PeerRemovedResponse(mapRequest, node, update.Removed)
case types.StateSelfUpdate:
if len(update.ChangeNodes) == 1 {
logTrace("Sending SelfUpdate MapResponse")
node = update.ChangeNodes[0]
data, err = mapp.LiteMapResponse(mapRequest, node, h.ACLPolicy, types.SelfUpdateIdentifier)
} else {
logWarn("SelfUpdate contained too many nodes, this is likely a bug in the code, please report.")
}
case types.StateDERPUpdated:
logTrace("Sending DERPUpdate MapResponse")
data, err = mapp.DERPMapResponse(mapRequest, node, update.DERPMap)
}
if err != nil {
logErr(err, "Could not get the create map update")
return
}
log.Trace().
Str("node", node.Hostname).
TimeDiff("timeSpent", time.Now(), startMapResp).
Str("mkey", node.MachineKey.String()).
Int("type", int(update.Type)).
Msg("finished making map response")
// Only send an update if there is a change.
if data != nil {
startWrite := time.Now()
_, err = writer.Write(data)
if err != nil {
logErr(err, "Could not write the map response")
updateRequestsSentToNode.WithLabelValues(node.User.Name, node.Hostname, "failed").
Inc()
return
}
if flusher, ok := writer.(http.Flusher); ok {
flusher.Flush()
} else {
log.Error().Msg("Connection does not support http.Flusher, cannot stream")
return
}
log.Trace().
Str("node", node.Hostname).
TimeDiff("timeSpent", time.Now(), startWrite).
Str("mkey", node.MachineKey.String()).
Int("type", int(update.Type)).
Msg("finished writing mapresp to node")
log.Info().
Caller().
Bool("readOnly", mapRequest.ReadOnly).
Bool("omitPeers", mapRequest.OmitPeers).
Bool("stream", mapRequest.Stream).
Str("node_key", node.NodeKey.ShortString()).
Str("machine_key", node.MachineKey.ShortString()).
Str("node", node.Hostname).
TimeDiff("timeSpent", time.Now(), now).
Msg("update sent")
}
case <-ctx.Done():
logTrace("The client has closed the connection")
go h.updateNodeOnlineStatus(false, node)
// Failover the node's routes if any.
go h.pollFailoverRoutes(logErr, "node closing connection", node)
// The connection has been closed, so we can stop polling.
return
case <-h.shutdownChan:
logTrace("The long-poll handler is shutting down")
return
}
}
}
func (h *Headscale) pollFailoverRoutes(logErr func(error, string), where string, node *types.Node) {
update, err := db.Write(h.db.DB, func(tx *gorm.DB) (*types.StateUpdate, error) {
return db.EnsureFailoverRouteIsAvailable(tx, h.nodeNotifier.ConnectedMap(), node)
})
if err != nil {
logErr(err, fmt.Sprintf("failed to ensure failover routes, %s", where))
return
}
if update != nil && !update.Empty() && update.Valid() {
ctx := types.NotifyCtx(context.Background(), fmt.Sprintf("poll-%s-routes-ensurefailover", strings.ReplaceAll(where, " ", "-")), node.Hostname)
h.nodeNotifier.NotifyWithIgnore(ctx, *update, node.MachineKey.String())
}
}
// updateNodeOnlineStatus records the last seen status of a node and
// notifies peers about the change in its online/offline status.
// It sends a StatePeerChangedPatch update carrying the new Online and
// LastSeen values.
func (h *Headscale) updateNodeOnlineStatus(online bool, node *types.Node) {
now := time.Now()
node.LastSeen = &now
statusUpdate := types.StateUpdate{
Type: types.StatePeerChangedPatch,
ChangePatches: []*tailcfg.PeerChange{
{
NodeID: tailcfg.NodeID(node.ID),
Online: &online,
LastSeen: &now,
},
},
}
if statusUpdate.Valid() {
ctx := types.NotifyCtx(context.Background(), "poll-nodeupdate-onlinestatus", node.Hostname)
h.nodeNotifier.NotifyWithIgnore(ctx, statusUpdate, node.MachineKey.String())
}
err := h.db.DB.Transaction(func(tx *gorm.DB) error {
return db.UpdateLastSeen(tx, node.ID, *node.LastSeen)
})
if err != nil {
log.Error().Err(err).Msg("Cannot update node LastSeen")
return
}
}
func closeChanWithLog[C chan []byte | chan struct{} | chan types.StateUpdate](channel C, node, name string) {
log.Trace().
Str("handler", "PollNetMap").
Str("node", node).
Str("channel", "Done").
Msg(fmt.Sprintf("Closing %s channel", name))
close(channel)
}
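// Example (as used in handlePoll):
//
//	updateChan := make(chan types.StateUpdate, chanSize)
//	defer closeChanWithLog(updateChan, node.Hostname, "updateChan")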
func (h *Headscale) handleLiteRequest(
writer http.ResponseWriter,
node *types.Node,
mapRequest tailcfg.MapRequest,
) {
logTrace, _, logErr := logPollFunc(mapRequest, node)
mapp := mapper.NewMapper(
node,
types.Nodes{},
h.DERPMap,
h.cfg.BaseDomain,
h.cfg.DNSConfig,
h.cfg.LogTail.Enabled,
h.cfg.RandomizeClientPort,
)
logTrace("Client asked for a lite update, responding without peers")
mapResp, err := mapp.LiteMapResponse(mapRequest, node, h.ACLPolicy)
if err != nil {
logErr(err, "Failed to create MapResponse")
http.Error(writer, "", http.StatusInternalServerError)
return
}
writer.Header().Set("Content-Type", "application/json; charset=utf-8")
writer.WriteHeader(http.StatusOK)
_, err = writer.Write(mapResp)
if err != nil {
logErr(err, "Failed to write response")
}
}
func logTracePeerChange(hostname string, hostinfoChange bool, change *tailcfg.PeerChange) {
trace := log.Trace().Str("node_id", change.NodeID.String()).Str("hostname", hostname)
if change.Key != nil {
trace = trace.Str("node_key", change.Key.ShortString())
}
if change.DiscoKey != nil {
trace = trace.Str("disco_key", change.DiscoKey.ShortString())
}
if change.Online != nil {
trace = trace.Bool("online", *change.Online)
}
if change.Endpoints != nil {
eps := make([]string, len(change.Endpoints))
for idx, ep := range change.Endpoints {
eps[idx] = ep.String()
}
trace = trace.Strs("endpoints", eps)
}
if hostinfoChange {
trace = trace.Bool("hostinfo_changed", hostinfoChange)
}
if change.DERPRegion != 0 {
trace = trace.Int("derp_region", change.DERPRegion)
}
trace.Time("last_seen", *change.LastSeen).Msg("PeerChange received")
}