diff --git a/hscontrol/app.go b/hscontrol/app.go
index 8ce1066f..f7d9ba90 100644
--- a/hscontrol/app.go
+++ b/hscontrol/app.go
@@ -67,6 +67,9 @@ var (
 	)
 )
 
+// oidcProviderInitTimeout is the timeout for initializing the OIDC provider.
+const oidcProviderInitTimeout = 30 * time.Second
+
 var (
 	debugDeadlock        = envknob.Bool("HEADSCALE_DEBUG_DEADLOCK")
 	debugDeadlockTimeout = envknob.RegisterDuration("HEADSCALE_DEBUG_DEADLOCK_TIMEOUT")
@@ -161,7 +164,7 @@ func NewHeadscale(cfg *types.Config) (*Headscale, error) {
 	authProvider = NewAuthProviderWeb(cfg.ServerURL)
 
 	if cfg.OIDC.Issuer != "" {
-		ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
+		ctx, cancel := context.WithTimeout(context.Background(), oidcProviderInitTimeout)
 		defer cancel()
 
 		oidcProvider, err := NewAuthProviderOIDC(
diff --git a/hscontrol/db/sqliteconfig/config.go b/hscontrol/db/sqliteconfig/config.go
index 23cb4b50..e6386718 100644
--- a/hscontrol/db/sqliteconfig/config.go
+++ b/hscontrol/db/sqliteconfig/config.go
@@ -22,6 +22,9 @@ var (
 const (
 	// DefaultBusyTimeout is the default busy timeout in milliseconds.
 	DefaultBusyTimeout = 10000
+	// DefaultWALAutocheckpoint is the default WAL autocheckpoint value (number of pages).
+	// SQLite default is 1000 pages.
+	DefaultWALAutocheckpoint = 1000
 )
 
 // JournalMode represents SQLite journal_mode pragma values.
@@ -310,7 +313,7 @@ func Default(path string) *Config {
 		BusyTimeout:       DefaultBusyTimeout,
 		JournalMode:       JournalModeWAL,
 		AutoVacuum:        AutoVacuumIncremental,
-		WALAutocheckpoint: 1000,
+		WALAutocheckpoint: DefaultWALAutocheckpoint,
 		Synchronous:       SynchronousNormal,
 		ForeignKeys:       true,
 		TxLock:            TxLockImmediate,
diff --git a/hscontrol/mapper/batcher.go b/hscontrol/mapper/batcher.go
index c1349f75..c5bbda48 100644
--- a/hscontrol/mapper/batcher.go
+++ b/hscontrol/mapper/batcher.go
@@ -22,6 +22,10 @@ var (
 	ErrNodeConnectionNil = errors.New("nodeConnection is nil")
 )
 
+// workChannelMultiplier is the multiplier for work channel capacity based on worker count.
+// The size is arbitrarily chosen; the sizing should be revisited.
+const workChannelMultiplier = 200
+
 var mapResponseGenerated = promauto.NewCounterVec(prometheus.CounterOpts{
 	Namespace: "headscale",
 	Name:      "mapresponse_generated_total",
@@ -49,8 +53,7 @@ func NewBatcher(batchTime time.Duration, workers int, mapper *mapper) *LockFreeB
 		workers: workers,
 		tick:    time.NewTicker(batchTime),
 
-		// The size of this channel is arbitrary chosen, the sizing should be revisited.
-		workCh:         make(chan work, workers*200),
+		workCh:         make(chan work, workers*workChannelMultiplier),
 		nodes:          xsync.NewMap[types.NodeID, *multiChannelNodeConn](),
 		connected:      xsync.NewMap[types.NodeID, *time.Time](),
 		pendingChanges: xsync.NewMap[types.NodeID, []change.Change](),
diff --git a/hscontrol/mapper/batcher_lockfree.go b/hscontrol/mapper/batcher_lockfree.go
index 988f0b35..2580b7f4 100644
--- a/hscontrol/mapper/batcher_lockfree.go
+++ b/hscontrol/mapper/batcher_lockfree.go
@@ -25,6 +25,25 @@ var (
 	ErrConnectionTimeout = errors.New("connection timeout sending to channel (likely stale connection)")
 )
 
+// Batcher configuration constants.
+const (
+	// initialMapSendTimeout is the timeout for sending the initial map response to a new connection.
+	initialMapSendTimeout = 5 * time.Second
+
+	// offlineNodeCleanupThreshold is how long a node must be offline before it's cleaned up.
+	offlineNodeCleanupThreshold = 15 * time.Minute
+
+	// offlineNodeCleanupInterval is the interval between cleanup runs.
+	offlineNodeCleanupInterval = 5 * time.Minute
+
+	// connectionSendTimeout is the timeout for detecting stale connections.
+	// Kept short to quickly detect Docker containers that are forcefully terminated.
+	connectionSendTimeout = 50 * time.Millisecond
+
+	// connectionIDBytes is the number of random bytes used for connection IDs.
+	connectionIDBytes = 8
+)
+
 // LockFreeBatcher uses atomic operations and concurrent maps to eliminate mutex contention.
 type LockFreeBatcher struct {
 	tick    *time.Ticker
@@ -94,9 +113,9 @@ func (b *LockFreeBatcher) AddNode(id types.NodeID, c chan<- *tailcfg.MapResponse
 	select {
 	case c <- initialMap:
 		// Success
-	case <-time.After(5 * time.Second):
+	case <-time.After(initialMapSendTimeout):
 		log.Error().Uint64("node.id", id.Uint64()).Err(ErrInitialMapTimeout).Msg("Initial map send timeout")
-		log.Debug().Caller().Uint64("node.id", id.Uint64()).Dur("timeout.duration", 5*time.Second).
+		log.Debug().Caller().Uint64("node.id", id.Uint64()).Dur("timeout.duration", initialMapSendTimeout).
 			Msg("Initial map send timed out because channel was blocked or receiver not ready")
 
 		nodeConn.removeConnectionByChannel(c)
@@ -187,7 +206,7 @@ func (b *LockFreeBatcher) doWork() {
 	}
 
 	// Create a cleanup ticker for removing truly disconnected nodes
-	cleanupTicker := time.NewTicker(5 * time.Minute)
+	cleanupTicker := time.NewTicker(offlineNodeCleanupInterval)
 	defer cleanupTicker.Stop()
 
 	for {
@@ -395,14 +414,13 @@ func (b *LockFreeBatcher) processBatchedChanges() {
 // cleanupOfflineNodes removes nodes that have been offline for too long to prevent memory leaks.
 // TODO(kradalby): reevaluate if we want to keep this.
 func (b *LockFreeBatcher) cleanupOfflineNodes() {
-	cleanupThreshold := 15 * time.Minute
 	now := time.Now()
 
 	var nodesToCleanup []types.NodeID
 
 	// Find nodes that have been offline for too long
 	b.connected.Range(func(nodeID types.NodeID, disconnectTime *time.Time) bool {
-		if disconnectTime != nil && now.Sub(*disconnectTime) > cleanupThreshold {
+		if disconnectTime != nil && now.Sub(*disconnectTime) > offlineNodeCleanupThreshold {
 			// Double-check the node doesn't have active connections
 			if nodeConn, exists := b.nodes.Load(nodeID); exists {
 				if !nodeConn.hasActiveConnections() {
@@ -417,7 +435,7 @@ func (b *LockFreeBatcher) cleanupOfflineNodes() {
 	// Clean up the identified nodes
 	for _, nodeID := range nodesToCleanup {
 		log.Info().Uint64("node.id", nodeID.Uint64()).
-			Dur("offline_duration", cleanupThreshold).
+			Dur("offline_duration", offlineNodeCleanupThreshold).
 			Msg("Cleaning up node that has been offline for too long")
 
 		b.nodes.Delete(nodeID)
@@ -532,7 +550,7 @@ type multiChannelNodeConn struct {
 
 // generateConnectionID generates a unique connection identifier.
 func generateConnectionID() string {
-	bytes := make([]byte, 8)
+	bytes := make([]byte, connectionIDBytes)
 	_, _ = rand.Read(bytes)
 
 	return hex.EncodeToString(bytes)
@@ -711,7 +729,7 @@ func (entry *connectionEntry) send(data *tailcfg.MapResponse) error {
 		// Update last used timestamp on successful send
 		entry.lastUsed.Store(time.Now().Unix())
 		return nil
-	case <-time.After(50 * time.Millisecond):
+	case <-time.After(connectionSendTimeout):
 		// Connection is likely stale - client isn't reading from channel
 		// This catches the case where Docker containers are killed but channels remain open
 		return fmt.Errorf("%w: connection %s", ErrConnectionTimeout, entry.id)
diff --git a/hscontrol/oidc.go b/hscontrol/oidc.go
index 836e8763..6ab70f78 100644
--- a/hscontrol/oidc.go
+++ b/hscontrol/oidc.go
@@ -28,6 +28,7 @@ const (
 	defaultOAuthOptionsCount = 3
 	registerCacheExpiration  = time.Minute * 15
 	registerCacheCleanup     = time.Minute * 20
+	csrfTokenLength          = 64
 )
 
 var (
@@ -614,7 +615,7 @@ func getCookieName(baseName, value string) string {
 }
 
 func setCSRFCookie(w http.ResponseWriter, r *http.Request, name string) (string, error) {
-	val, err := util.GenerateRandomStringURLSafe(64)
+	val, err := util.GenerateRandomStringURLSafe(csrfTokenLength)
 	if err != nil {
 		return val, err
 	}
diff --git a/hscontrol/types/config.go b/hscontrol/types/config.go
index e947e104..5a32147d 100644
--- a/hscontrol/types/config.go
+++ b/hscontrol/types/config.go
@@ -29,7 +29,8 @@ const (
 	PKCEMethodPlain string = "plain"
 	PKCEMethodS256  string = "S256"
 
-	defaultNodeStoreBatchSize = 100
+	defaultNodeStoreBatchSize = 100
+	defaultWALAutocheckpoint  = 1000 // SQLite default
 )
 
 var (
@@ -380,7 +381,7 @@ func LoadConfig(path string, isFile bool) error {
 	viper.SetDefault("database.postgres.conn_max_idle_time_secs", 3600)
 
 	viper.SetDefault("database.sqlite.write_ahead_log", true)
-	viper.SetDefault("database.sqlite.wal_autocheckpoint", 1000) // SQLite default
+	viper.SetDefault("database.sqlite.wal_autocheckpoint", defaultWALAutocheckpoint)
 
 	viper.SetDefault("oidc.scope", []string{oidc.ScopeOpenID, "profile", "email"})
 	viper.SetDefault("oidc.only_start_if_oidc_is_available", true)
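Note: the connectionSendTimeout constant above feeds the stale-connection check in connectionEntry.send. Below is a minimal, self-contained sketch of that select-with-timeout pattern; sendWithTimeout, errStaleConnection, and the main function are illustrative names for this sketch only and are not part of the diff or of headscale.

package main

import (
	"errors"
	"fmt"
	"time"
)

// Assumed to mirror the constant introduced in the diff.
const connectionSendTimeout = 50 * time.Millisecond

var errStaleConnection = errors.New("stale connection")

// sendWithTimeout attempts a channel send and treats a receiver that does
// not drain the channel within connectionSendTimeout as stale.
func sendWithTimeout(ch chan<- string, msg string) error {
	select {
	case ch <- msg:
		// Receiver picked the message up in time.
		return nil
	case <-time.After(connectionSendTimeout):
		// Receiver is not reading from the channel; report it as stale.
		return errStaleConnection
	}
}

func main() {
	ch := make(chan string) // unbuffered and never read: simulates a dead client
	if err := sendWithTimeout(ch, "map response"); err != nil {
		fmt.Println(err) // prints "stale connection" after roughly 50ms
	}
}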