Setup: Update Ollama service examples in compose.yaml files #5123

Signed-off-by: Michael Mayer <michael@photoprism.app>
Michael Mayer committed 2025-09-01 13:07:22 +02:00
parent fccdc50e6e
commit 19fff8b0bf
5 changed files with 15 additions and 16 deletions

@@ -173,15 +173,15 @@ services:
 OLLAMA_NUM_PARALLEL: "1" # maximum number of parallel requests
 OLLAMA_MAX_LOADED_MODELS: "1" # maximum number of loaded models per GPU
 OLLAMA_LOAD_TIMEOUT: "5m" # maximum time for loading models (default "5m")
-OLLAMA_KEEP_ALIVE: "15m" # duration that models stay loaded in memory (default "5m")
+OLLAMA_KEEP_ALIVE: "5m" # duration that models stay loaded in memory (default "5m")
 OLLAMA_CONTEXT_LENGTH: "4096" # maximum input context length
 OLLAMA_MULTIUSER_CACHE: "false" # optimize prompt caching for multi-user scenarios
-OLLAMA_NOPRUNE: "true" # disables pruning of model blobs at startup
+OLLAMA_NOPRUNE: "false" # disables pruning of model blobs at startup
 OLLAMA_NOHISTORY: "true" # disables readline history
 OLLAMA_FLASH_ATTENTION: "false" # enables the experimental flash attention feature
 OLLAMA_KV_CACHE_TYPE: "f16" # see https://mitjamartini.com/blog/kv-cache-quantization-in-ollama/
 OLLAMA_SCHED_SPREAD: "false" # allows scheduling models across all GPUs.
-OLLAMA_NEW_ENGINE: "false" # enables the new Ollama engine
+OLLAMA_NEW_ENGINE: "true" # enables the new Ollama engine
 # OLLAMA_DEBUG: "true" # shows additional debug information
 # OLLAMA_INTEL_GPU: "true" # enables experimental Intel GPU detection
 ## NVIDIA GPU Hardware Acceleration (see https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html):
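For orientation, the variables in these hunks sit under the "environment:" key of the Ollama service in each compose.yaml example. The sketch below shows a minimal such service with the updated values; the image tag, published port, and volume path are illustrative assumptions, not part of this commit:

services:
  ollama:
    image: ollama/ollama:latest # assumed tag; pin a specific version for reproducible setups
    restart: unless-stopped
    ports:
      - "11434:11434" # default Ollama API port
    environment:
      OLLAMA_KEEP_ALIVE: "5m" # updated: unload idle models after 5 minutes (the Ollama default)
      OLLAMA_NOPRUNE: "false" # updated: prune unused model blobs at startup
      OLLAMA_NEW_ENGINE: "true" # updated: opt in to the new Ollama engine
    volumes:
      - "./ollama:/root/.ollama" # assumed host path for persisting downloaded models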

@@ -253,15 +253,15 @@ services:
 OLLAMA_NUM_PARALLEL: "1" # maximum number of parallel requests
 OLLAMA_MAX_LOADED_MODELS: "1" # maximum number of loaded models per GPU
 OLLAMA_LOAD_TIMEOUT: "5m" # maximum time for loading models (default "5m")
-OLLAMA_KEEP_ALIVE: "15m" # duration that models stay loaded in memory (default "5m")
+OLLAMA_KEEP_ALIVE: "5m" # duration that models stay loaded in memory (default "5m")
 OLLAMA_CONTEXT_LENGTH: "4096" # maximum input context length
 OLLAMA_MULTIUSER_CACHE: "false" # optimize prompt caching for multi-user scenarios
-OLLAMA_NOPRUNE: "true" # disables pruning of model blobs at startup
+OLLAMA_NOPRUNE: "false" # disables pruning of model blobs at startup
 OLLAMA_NOHISTORY: "true" # disables readline history
 OLLAMA_FLASH_ATTENTION: "false" # enables the experimental flash attention feature
 OLLAMA_KV_CACHE_TYPE: "f16" # see https://mitjamartini.com/blog/kv-cache-quantization-in-ollama/
 OLLAMA_SCHED_SPREAD: "false" # allows scheduling models across all GPUs.
-OLLAMA_NEW_ENGINE: "false" # enables the new Ollama engine
+OLLAMA_NEW_ENGINE: "true" # enables the new Ollama engine
 # OLLAMA_DEBUG: "true" # shows additional debug information
 # OLLAMA_INTEL_GPU: "true" # enables experimental Intel GPU detection
 ## NVIDIA GPU Hardware Acceleration (optional):

@@ -175,15 +175,15 @@ services:
 OLLAMA_NUM_PARALLEL: "1" # maximum number of parallel requests
 OLLAMA_MAX_LOADED_MODELS: "1" # maximum number of loaded models per GPU
 OLLAMA_LOAD_TIMEOUT: "5m" # maximum time for loading models (default "5m")
-OLLAMA_KEEP_ALIVE: "15m" # duration that models stay loaded in memory (default "5m")
+OLLAMA_KEEP_ALIVE: "5m" # duration that models stay loaded in memory (default "5m")
 OLLAMA_CONTEXT_LENGTH: "4096" # maximum input context length
 OLLAMA_MULTIUSER_CACHE: "false" # optimize prompt caching for multi-user scenarios
-OLLAMA_NOPRUNE: "true" # disables pruning of model blobs at startup
+OLLAMA_NOPRUNE: "false" # disables pruning of model blobs at startup
 OLLAMA_NOHISTORY: "true" # disables readline history
 OLLAMA_FLASH_ATTENTION: "false" # enables the experimental flash attention feature
 OLLAMA_KV_CACHE_TYPE: "f16" # see https://mitjamartini.com/blog/kv-cache-quantization-in-ollama/
 OLLAMA_SCHED_SPREAD: "false" # allows scheduling models across all GPUs.
-OLLAMA_NEW_ENGINE: "false" # enables the new Ollama engine
+OLLAMA_NEW_ENGINE: "true" # enables the new Ollama engine
 # OLLAMA_DEBUG: "true" # shows additional debug information
 # OLLAMA_INTEL_GPU: "true" # enables experimental Intel GPU detection
 ## NVIDIA GPU Hardware Acceleration (see https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html):

@@ -180,15 +180,15 @@ services:
 OLLAMA_NUM_PARALLEL: "1" # maximum number of parallel requests
 OLLAMA_MAX_LOADED_MODELS: "1" # maximum number of loaded models per GPU
 OLLAMA_LOAD_TIMEOUT: "5m" # maximum time for loading models (default "5m")
-OLLAMA_KEEP_ALIVE: "15m" # duration that models stay loaded in memory (default "5m")
+OLLAMA_KEEP_ALIVE: "5m" # duration that models stay loaded in memory (default "5m")
 OLLAMA_CONTEXT_LENGTH: "4096" # maximum input context length
 OLLAMA_MULTIUSER_CACHE: "false" # optimize prompt caching for multi-user scenarios
-OLLAMA_NOPRUNE: "true" # disables pruning of model blobs at startup
+OLLAMA_NOPRUNE: "false" # disables pruning of model blobs at startup
 OLLAMA_NOHISTORY: "true" # disables readline history
 OLLAMA_FLASH_ATTENTION: "false" # enables the experimental flash attention feature
 OLLAMA_KV_CACHE_TYPE: "f16" # see https://mitjamartini.com/blog/kv-cache-quantization-in-ollama/
 OLLAMA_SCHED_SPREAD: "false" # allows scheduling models across all GPUs.
-OLLAMA_NEW_ENGINE: "false" # enables the new Ollama engine
+OLLAMA_NEW_ENGINE: "true" # enables the new Ollama engine
 # OLLAMA_DEBUG: "true" # shows additional debug information
 # OLLAMA_INTEL_GPU: "true" # enables experimental Intel GPU detection
 ## NVIDIA GPU Hardware Acceleration (see https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html):

@@ -180,16 +180,15 @@ services:
 OLLAMA_NUM_PARALLEL: "1" # maximum number of parallel requests
 OLLAMA_MAX_LOADED_MODELS: "1" # maximum number of loaded models per GPU
 OLLAMA_LOAD_TIMEOUT: "5m" # maximum time for loading models (default "5m")
-OLLAMA_KEEP_ALIVE: "15m" # duration that models stay loaded in memory (default "5m")
+OLLAMA_KEEP_ALIVE: "5m" # duration that models stay loaded in memory (default "5m")
 OLLAMA_CONTEXT_LENGTH: "4096" # maximum input context length
 OLLAMA_MULTIUSER_CACHE: "false" # optimize prompt caching for multi-user scenarios
-OLLAMA_NOPRUNE: "true" # disables pruning of model blobs at startup
+OLLAMA_NOPRUNE: "false" # disables pruning of model blobs at startup
 OLLAMA_NOHISTORY: "true" # disables readline history
 OLLAMA_FLASH_ATTENTION: "false" # enables the experimental flash attention feature
 OLLAMA_KV_CACHE_TYPE: "f16" # see https://mitjamartini.com/blog/kv-cache-quantization-in-ollama/
 OLLAMA_SCHED_SPREAD: "false" # allows scheduling models across all GPUs.
-OLLAMA_INTEL_GPU: "false" # enables experimental Intel GPU detection
-OLLAMA_NEW_ENGINE: "false" # enables the new Ollama engine
+OLLAMA_NEW_ENGINE: "true" # enables the new Ollama engine
 # OLLAMA_DEBUG: "true" # shows additional debug information
 # OLLAMA_INTEL_GPU: "true" # enables experimental Intel GPU detection
 ## NVIDIA GPU Hardware Acceleration (see https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html):
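Deployments that preferred the previous behavior, such as keeping models loaded longer on a dedicated GPU host, can override individual values without editing the shipped example. A minimal sketch using a compose.override.yaml, which Docker Compose merges over compose.yaml automatically (assumes the service is named "ollama", as in the examples above):

# compose.override.yaml (sketch; environment entries are merged key by key)
services:
  ollama:
    environment:
      OLLAMA_KEEP_ALIVE: "15m" # restore the longer keep-alive used before this change
      OLLAMA_NOPRUNE: "true" # keep model blobs at startup instead of pruning them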