headscale/cmd/hi/cleanup.go
Kristoffer Dalby b36438bf90 all: disable thelper linter and clean up nolint comments
Disable the thelper linter which triggers on inline test closures in
table-driven tests. These closures are intentionally not standalone
helpers and don't benefit from t.Helper().

Also remove explanatory comments from nolint directives throughout the
codebase as they add noise without providing significant value.
2026-01-21 15:56:57 +00:00

438 lines
11 KiB
Go

package main
import (
"context"
"fmt"
"log"
"os"
"path/filepath"
"strings"
"time"
"github.com/cenkalti/backoff/v5"
cerrdefs "github.com/containerd/errdefs"
"github.com/docker/docker/api/types/container"
"github.com/docker/docker/api/types/filters"
"github.com/docker/docker/api/types/image"
"github.com/docker/docker/client"
)
// cleanupBeforeTest performs cleanup operations before running tests.
// Only removes stale (stopped/exited) test containers to avoid interfering with concurrent test runs.
func cleanupBeforeTest(ctx context.Context) error {
err := cleanupStaleTestContainers(ctx)
if err != nil {
return fmt.Errorf("failed to clean stale test containers: %w", err)
}
//nolint:noinlineerr
if err := pruneDockerNetworks(ctx); err != nil {
return fmt.Errorf("failed to prune networks: %w", err)
}
return nil
}
// cleanupAfterTest removes the test container and all associated integration test containers for the run.
func cleanupAfterTest(ctx context.Context, cli *client.Client, containerID, runID string) error {
// Remove the main test container
err := cli.ContainerRemove(ctx, containerID, container.RemoveOptions{
Force: true,
})
if err != nil {
return fmt.Errorf("failed to remove test container: %w", err)
}
// Clean up integration test containers for this run only
if runID != "" {
err := killTestContainersByRunID(ctx, runID)
if err != nil {
return fmt.Errorf("failed to clean up containers for run %s: %w", runID, err)
}
}
return nil
}
// killTestContainers terminates and removes all test containers.
func killTestContainers(ctx context.Context) error {
//nolint:contextcheck
cli, err := createDockerClient()
if err != nil {
return fmt.Errorf("failed to create Docker client: %w", err)
}
defer cli.Close()
containers, err := cli.ContainerList(ctx, container.ListOptions{
All: true,
})
if err != nil {
return fmt.Errorf("failed to list containers: %w", err)
}
removed := 0
for _, cont := range containers {
shouldRemove := false
for _, name := range cont.Names {
if strings.Contains(name, "headscale-test-suite") ||
strings.Contains(name, "hs-") ||
strings.Contains(name, "ts-") ||
strings.Contains(name, "derp-") {
shouldRemove = true
break
}
}
if shouldRemove {
// First kill the container if it's running
if cont.State == "running" {
_ = cli.ContainerKill(ctx, cont.ID, "KILL")
}
// Then remove the container with retry logic
if removeContainerWithRetry(ctx, cli, cont.ID) {
removed++
}
}
}
if removed > 0 {
fmt.Printf("Removed %d test containers\n", removed)
} else {
fmt.Println("No test containers found to remove")
}
return nil
}
// killTestContainersByRunID terminates and removes all test containers for a specific run ID.
// This function filters containers by the hi.run-id label to only affect containers
// belonging to the specified test run, leaving other concurrent test runs untouched.
func killTestContainersByRunID(ctx context.Context, runID string) error {
//nolint:contextcheck
cli, err := createDockerClient()
if err != nil {
return fmt.Errorf("failed to create Docker client: %w", err)
}
defer cli.Close()
// Filter containers by hi.run-id label
containers, err := cli.ContainerList(ctx, container.ListOptions{
All: true,
Filters: filters.NewArgs(
filters.Arg("label", "hi.run-id="+runID),
),
})
if err != nil {
return fmt.Errorf("failed to list containers for run %s: %w", runID, err)
}
removed := 0
for _, cont := range containers {
// Kill the container if it's running
if cont.State == "running" {
_ = cli.ContainerKill(ctx, cont.ID, "KILL")
}
// Remove the container with retry logic
if removeContainerWithRetry(ctx, cli, cont.ID) {
removed++
}
}
if removed > 0 {
fmt.Printf("Removed %d containers for run ID %s\n", removed, runID)
}
return nil
}
// cleanupStaleTestContainers removes stopped/exited test containers without affecting running tests.
// This is useful for cleaning up leftover containers from previous crashed or interrupted test runs
// without interfering with currently running concurrent tests.
func cleanupStaleTestContainers(ctx context.Context) error {
//nolint:contextcheck
cli, err := createDockerClient()
if err != nil {
return fmt.Errorf("failed to create Docker client: %w", err)
}
defer cli.Close()
// Only get stopped/exited containers
containers, err := cli.ContainerList(ctx, container.ListOptions{
All: true,
Filters: filters.NewArgs(
filters.Arg("status", "exited"),
filters.Arg("status", "dead"),
),
})
if err != nil {
return fmt.Errorf("failed to list stopped containers: %w", err)
}
removed := 0
for _, cont := range containers {
// Only remove containers that look like test containers
shouldRemove := false
for _, name := range cont.Names {
if strings.Contains(name, "headscale-test-suite") ||
strings.Contains(name, "hs-") ||
strings.Contains(name, "ts-") ||
strings.Contains(name, "derp-") {
shouldRemove = true
break
}
}
if shouldRemove {
if removeContainerWithRetry(ctx, cli, cont.ID) {
removed++
}
}
}
if removed > 0 {
fmt.Printf("Removed %d stale test containers\n", removed)
}
return nil
}
const (
containerRemoveInitialInterval = 100 * time.Millisecond
containerRemoveMaxElapsedTime = 2 * time.Second
)
// removeContainerWithRetry attempts to remove a container with exponential backoff retry logic.
func removeContainerWithRetry(ctx context.Context, cli *client.Client, containerID string) bool {
expBackoff := backoff.NewExponentialBackOff()
expBackoff.InitialInterval = containerRemoveInitialInterval
_, err := backoff.Retry(ctx, func() (struct{}, error) {
err := cli.ContainerRemove(ctx, containerID, container.RemoveOptions{
Force: true,
})
if err != nil {
return struct{}{}, err
}
return struct{}{}, nil
}, backoff.WithBackOff(expBackoff), backoff.WithMaxElapsedTime(containerRemoveMaxElapsedTime))
return err == nil
}
// pruneDockerNetworks removes unused Docker networks.
func pruneDockerNetworks(ctx context.Context) error {
//nolint:contextcheck
cli, err := createDockerClient()
if err != nil {
return fmt.Errorf("failed to create Docker client: %w", err)
}
defer cli.Close()
report, err := cli.NetworksPrune(ctx, filters.Args{})
if err != nil {
return fmt.Errorf("failed to prune networks: %w", err)
}
if len(report.NetworksDeleted) > 0 {
fmt.Printf("Removed %d unused networks\n", len(report.NetworksDeleted))
} else {
fmt.Println("No unused networks found to remove")
}
return nil
}
// cleanOldImages removes test-related and old dangling Docker images.
func cleanOldImages(ctx context.Context) error {
//nolint:contextcheck
cli, err := createDockerClient()
if err != nil {
return fmt.Errorf("failed to create Docker client: %w", err)
}
defer cli.Close()
images, err := cli.ImageList(ctx, image.ListOptions{
All: true,
})
if err != nil {
return fmt.Errorf("failed to list images: %w", err)
}
removed := 0
for _, img := range images {
shouldRemove := false
for _, tag := range img.RepoTags {
if strings.Contains(tag, "hs-") ||
strings.Contains(tag, "headscale-integration") ||
strings.Contains(tag, "tailscale") {
shouldRemove = true
break
}
}
if len(img.RepoTags) == 0 && time.Unix(img.Created, 0).Before(time.Now().Add(-7*24*time.Hour)) {
shouldRemove = true
}
if shouldRemove {
_, err := cli.ImageRemove(ctx, img.ID, image.RemoveOptions{
Force: true,
})
if err == nil {
removed++
}
}
}
if removed > 0 {
fmt.Printf("Removed %d test images\n", removed)
} else {
fmt.Println("No test images found to remove")
}
return nil
}
// cleanCacheVolume removes the Docker volume used for Go module cache.
func cleanCacheVolume(ctx context.Context) error {
//nolint:contextcheck
cli, err := createDockerClient()
if err != nil {
return fmt.Errorf("failed to create Docker client: %w", err)
}
defer cli.Close()
volumeName := "hs-integration-go-cache"
err = cli.VolumeRemove(ctx, volumeName, true)
if err != nil {
if cerrdefs.IsNotFound(err) {
fmt.Printf("Go module cache volume not found: %s\n", volumeName)
} else if cerrdefs.IsConflict(err) {
fmt.Printf("Go module cache volume is in use and cannot be removed: %s\n", volumeName)
} else {
fmt.Printf("Failed to remove Go module cache volume %s: %v\n", volumeName, err)
}
} else {
fmt.Printf("Removed Go module cache volume: %s\n", volumeName)
}
return nil
}
// cleanupSuccessfulTestArtifacts removes artifacts from successful test runs to save disk space.
// This function removes large artifacts that are mainly useful for debugging failures:
// - Database dumps (.db files)
// - Profile data (pprof directories)
// - MapResponse data (mapresponses directories)
// - Prometheus metrics files
//
// It preserves:
// - Log files (.log) which are small and useful for verification.
func cleanupSuccessfulTestArtifacts(logsDir string, verbose bool) error {
entries, err := os.ReadDir(logsDir)
if err != nil {
return fmt.Errorf("failed to read logs directory: %w", err)
}
var (
removedFiles, removedDirs int
totalSize int64
)
for _, entry := range entries {
name := entry.Name()
fullPath := filepath.Join(logsDir, name)
if entry.IsDir() {
// Remove pprof and mapresponses directories (typically large)
// These directories contain artifacts from all containers in the test run
if name == "pprof" || name == "mapresponses" {
size, sizeErr := getDirSize(fullPath)
if sizeErr == nil {
totalSize += size
}
err := os.RemoveAll(fullPath)
if err != nil {
if verbose {
log.Printf("Warning: failed to remove directory %s: %v", name, err)
}
} else {
removedDirs++
if verbose {
log.Printf("Removed directory: %s/", name)
}
}
}
} else {
// Only process test-related files (headscale and tailscale)
if !strings.HasPrefix(name, "hs-") && !strings.HasPrefix(name, "ts-") {
continue
}
// Remove database, metrics, and status files, but keep logs
shouldRemove := strings.HasSuffix(name, ".db") ||
strings.HasSuffix(name, "_metrics.txt") ||
strings.HasSuffix(name, "_status.json")
if shouldRemove {
info, infoErr := entry.Info()
if infoErr == nil {
totalSize += info.Size()
}
err := os.Remove(fullPath)
if err != nil {
if verbose {
log.Printf("Warning: failed to remove file %s: %v", name, err)
}
} else {
removedFiles++
if verbose {
log.Printf("Removed file: %s", name)
}
}
}
}
}
if removedFiles > 0 || removedDirs > 0 {
const bytesPerMB = 1024 * 1024
log.Printf("Cleaned up %d files and %d directories (freed ~%.2f MB)",
removedFiles, removedDirs, float64(totalSize)/bytesPerMB)
}
return nil
}
// getDirSize calculates the total size of a directory.
func getDirSize(path string) (int64, error) {
var size int64
err := filepath.Walk(path, func(_ string, info os.FileInfo, err error) error {
if err != nil {
return err
}
if !info.IsDir() {
size += info.Size()
}
return nil
})
return size, err
}