hscontrol: extract magic numbers to named constants

Define named constants for magic numbers to improve code clarity:
- Batcher timeouts and intervals (initialMapSendTimeout, etc.)
- Work channel multiplier
- OIDC provider init timeout
- CSRF token length
- SQLite WAL autocheckpoint default
Kristoffer Dalby 2026-01-20 16:11:39 +00:00
parent f969745db4
commit 1462847878
6 changed files with 44 additions and 15 deletions
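The pattern repeated across all six files is the same: a bare numeric literal inside a function body becomes a package-level named constant whose doc comment records the unit and the rationale, and the call site references the constant. The sketch below is a minimal illustration of that shape only; it is not taken from the Headscale sources, and the names sendTimeout, errSendTimeout, and notifyPeer are invented for the example.

package main

import (
	"errors"
	"fmt"
	"time"
)

// sendTimeout bounds how long a send may block before the receiver is
// considered stale. Naming the value documents it and lets every call
// site share it instead of repeating the literal 5 * time.Second.
const sendTimeout = 5 * time.Second

var errSendTimeout = errors.New("send timed out")

// notifyPeer delivers msg, giving up after sendTimeout rather than
// after an inline magic number.
func notifyPeer(ch chan<- string, msg string) error {
	select {
	case ch <- msg:
		return nil
	case <-time.After(sendTimeout):
		return fmt.Errorf("%w after %v", errSendTimeout, sendTimeout)
	}
}

func main() {
	ch := make(chan string, 1)
	fmt.Println(notifyPeer(ch, "hello")) // prints <nil>
}

The diffs below apply this transformation to the batcher, OIDC, and SQLite configuration code.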


@@ -67,6 +67,9 @@ var (
)
)
+// oidcProviderInitTimeout is the timeout for initializing the OIDC provider.
+const oidcProviderInitTimeout = 30 * time.Second
+
var (
debugDeadlock = envknob.Bool("HEADSCALE_DEBUG_DEADLOCK")
debugDeadlockTimeout = envknob.RegisterDuration("HEADSCALE_DEBUG_DEADLOCK_TIMEOUT")
@@ -161,7 +164,7 @@ func NewHeadscale(cfg *types.Config) (*Headscale, error) {
authProvider = NewAuthProviderWeb(cfg.ServerURL)
if cfg.OIDC.Issuer != "" {
-ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
+ctx, cancel := context.WithTimeout(context.Background(), oidcProviderInitTimeout)
defer cancel()
oidcProvider, err := NewAuthProviderOIDC(


@@ -22,6 +22,9 @@ var (
const (
// DefaultBusyTimeout is the default busy timeout in milliseconds.
DefaultBusyTimeout = 10000
+// DefaultWALAutocheckpoint is the default WAL autocheckpoint value (number of pages).
+// SQLite default is 1000 pages.
+DefaultWALAutocheckpoint = 1000
)
// JournalMode represents SQLite journal_mode pragma values.
@@ -310,7 +313,7 @@ func Default(path string) *Config {
BusyTimeout: DefaultBusyTimeout,
JournalMode: JournalModeWAL,
AutoVacuum: AutoVacuumIncremental,
-WALAutocheckpoint: 1000,
+WALAutocheckpoint: DefaultWALAutocheckpoint,
Synchronous: SynchronousNormal,
ForeignKeys: true,
TxLock: TxLockImmediate,


@@ -22,6 +22,10 @@ var (
ErrNodeConnectionNil = errors.New("nodeConnection is nil")
)
+// workChannelMultiplier is the multiplier for work channel capacity based on worker count.
+// The size is arbitrary chosen, the sizing should be revisited.
+const workChannelMultiplier = 200
+
var mapResponseGenerated = promauto.NewCounterVec(prometheus.CounterOpts{
Namespace: "headscale",
Name: "mapresponse_generated_total",
@@ -49,8 +53,7 @@ func NewBatcher(batchTime time.Duration, workers int, mapper *mapper) *LockFreeB
workers: workers,
tick: time.NewTicker(batchTime),
-// The size of this channel is arbitrary chosen, the sizing should be revisited.
-workCh: make(chan work, workers*200),
+workCh: make(chan work, workers*workChannelMultiplier),
nodes: xsync.NewMap[types.NodeID, *multiChannelNodeConn](),
connected: xsync.NewMap[types.NodeID, *time.Time](),
pendingChanges: xsync.NewMap[types.NodeID, []change.Change](),


@@ -25,6 +25,25 @@ var (
ErrConnectionTimeout = errors.New("connection timeout sending to channel (likely stale connection)")
)
+// Batcher configuration constants.
+const (
+// initialMapSendTimeout is the timeout for sending the initial map response to a new connection.
+initialMapSendTimeout = 5 * time.Second
+
+// offlineNodeCleanupThreshold is how long a node must be offline before it's cleaned up.
+offlineNodeCleanupThreshold = 15 * time.Minute
+
+// offlineNodeCleanupInterval is the interval between cleanup runs.
+offlineNodeCleanupInterval = 5 * time.Minute
+
+// connectionSendTimeout is the timeout for detecting stale connections.
+// Kept short to quickly detect Docker containers that are forcefully terminated.
+connectionSendTimeout = 50 * time.Millisecond
+
+// connectionIDBytes is the number of random bytes used for connection IDs.
+connectionIDBytes = 8
+)
+
// LockFreeBatcher uses atomic operations and concurrent maps to eliminate mutex contention.
type LockFreeBatcher struct {
tick *time.Ticker
@@ -94,9 +113,9 @@ func (b *LockFreeBatcher) AddNode(id types.NodeID, c chan<- *tailcfg.MapResponse
select {
case c <- initialMap:
// Success
-case <-time.After(5 * time.Second):
+case <-time.After(initialMapSendTimeout):
log.Error().Uint64("node.id", id.Uint64()).Err(ErrInitialMapTimeout).Msg("Initial map send timeout")
-log.Debug().Caller().Uint64("node.id", id.Uint64()).Dur("timeout.duration", 5*time.Second).
+log.Debug().Caller().Uint64("node.id", id.Uint64()).Dur("timeout.duration", initialMapSendTimeout).
Msg("Initial map send timed out because channel was blocked or receiver not ready")
nodeConn.removeConnectionByChannel(c)
@@ -187,7 +206,7 @@ func (b *LockFreeBatcher) doWork() {
}
// Create a cleanup ticker for removing truly disconnected nodes
-cleanupTicker := time.NewTicker(5 * time.Minute)
+cleanupTicker := time.NewTicker(offlineNodeCleanupInterval)
defer cleanupTicker.Stop()
for {
@@ -395,14 +414,13 @@ func (b *LockFreeBatcher) processBatchedChanges() {
// cleanupOfflineNodes removes nodes that have been offline for too long to prevent memory leaks.
// TODO(kradalby): reevaluate if we want to keep this.
func (b *LockFreeBatcher) cleanupOfflineNodes() {
-cleanupThreshold := 15 * time.Minute
now := time.Now()
var nodesToCleanup []types.NodeID
// Find nodes that have been offline for too long
b.connected.Range(func(nodeID types.NodeID, disconnectTime *time.Time) bool {
-if disconnectTime != nil && now.Sub(*disconnectTime) > cleanupThreshold {
+if disconnectTime != nil && now.Sub(*disconnectTime) > offlineNodeCleanupThreshold {
// Double-check the node doesn't have active connections
if nodeConn, exists := b.nodes.Load(nodeID); exists {
if !nodeConn.hasActiveConnections() {
@@ -417,7 +435,7 @@ func (b *LockFreeBatcher) cleanupOfflineNodes() {
// Clean up the identified nodes
for _, nodeID := range nodesToCleanup {
log.Info().Uint64("node.id", nodeID.Uint64()).
Dur("offline_duration", cleanupThreshold).
Dur("offline_duration", offlineNodeCleanupThreshold).
Msg("Cleaning up node that has been offline for too long")
b.nodes.Delete(nodeID)
@@ -532,7 +550,7 @@ type multiChannelNodeConn struct {
// generateConnectionID generates a unique connection identifier.
func generateConnectionID() string {
-bytes := make([]byte, 8)
+bytes := make([]byte, connectionIDBytes)
_, _ = rand.Read(bytes)
return hex.EncodeToString(bytes)
@@ -711,7 +729,7 @@ func (entry *connectionEntry) send(data *tailcfg.MapResponse) error {
// Update last used timestamp on successful send
entry.lastUsed.Store(time.Now().Unix())
return nil
-case <-time.After(50 * time.Millisecond):
+case <-time.After(connectionSendTimeout):
// Connection is likely stale - client isn't reading from channel
// This catches the case where Docker containers are killed but channels remain open
return fmt.Errorf("%w: connection %s", ErrConnectionTimeout, entry.id)


@@ -28,6 +28,7 @@ const (
defaultOAuthOptionsCount = 3
registerCacheExpiration = time.Minute * 15
registerCacheCleanup = time.Minute * 20
+csrfTokenLength = 64
)
var (
@@ -614,7 +615,7 @@ func getCookieName(baseName, value string) string {
}
func setCSRFCookie(w http.ResponseWriter, r *http.Request, name string) (string, error) {
-val, err := util.GenerateRandomStringURLSafe(64)
+val, err := util.GenerateRandomStringURLSafe(csrfTokenLength)
if err != nil {
return val, err
}


@@ -29,7 +29,8 @@ const (
PKCEMethodPlain string = "plain"
PKCEMethodS256 string = "S256"
-defaultNodeStoreBatchSize = 100
+defaultNodeStoreBatchSize = 100
+defaultWALAutocheckpoint = 1000 // SQLite default
)
var (
@@ -380,7 +381,7 @@ func LoadConfig(path string, isFile bool) error {
viper.SetDefault("database.postgres.conn_max_idle_time_secs", 3600)
viper.SetDefault("database.sqlite.write_ahead_log", true)
viper.SetDefault("database.sqlite.wal_autocheckpoint", 1000) // SQLite default
viper.SetDefault("database.sqlite.wal_autocheckpoint", defaultWALAutocheckpoint)
viper.SetDefault("oidc.scope", []string{oidc.ScopeOpenID, "profile", "email"})
viper.SetDefault("oidc.only_start_if_oidc_is_available", true)