Setup: Update Ollama service examples in compose.yaml files #5123

Signed-off-by: Michael Mayer <michael@photoprism.app>
Michael Mayer committed 2025-09-01 13:07:22 +02:00
parent fccdc50e6e
commit 19fff8b0bf
5 changed files with 15 additions and 16 deletions

@@ -173,15 +173,15 @@ services:
 OLLAMA_NUM_PARALLEL: "1" # maximum number of parallel requests
 OLLAMA_MAX_LOADED_MODELS: "1" # maximum number of loaded models per GPU
 OLLAMA_LOAD_TIMEOUT: "5m" # maximum time for loading models (default "5m")
-OLLAMA_KEEP_ALIVE: "15m" # duration that models stay loaded in memory (default "5m")
+OLLAMA_KEEP_ALIVE: "5m" # duration that models stay loaded in memory (default "5m")
 OLLAMA_CONTEXT_LENGTH: "4096" # maximum input context length
 OLLAMA_MULTIUSER_CACHE: "false" # optimize prompt caching for multi-user scenarios
-OLLAMA_NOPRUNE: "true" # disables pruning of model blobs at startup
+OLLAMA_NOPRUNE: "false" # disables pruning of model blobs at startup
 OLLAMA_NOHISTORY: "true" # disables readline history
 OLLAMA_FLASH_ATTENTION: "false" # enables the experimental flash attention feature
 OLLAMA_KV_CACHE_TYPE: "f16" # see https://mitjamartini.com/blog/kv-cache-quantization-in-ollama/
 OLLAMA_SCHED_SPREAD: "false" # allows scheduling models across all GPUs.
-OLLAMA_NEW_ENGINE: "false" # enables the new Ollama engine
+OLLAMA_NEW_ENGINE: "true" # enables the new Ollama engine
 # OLLAMA_DEBUG: "true" # shows additional debug information
 # OLLAMA_INTEL_GPU: "true" # enables experimental Intel GPU detection
 ## NVIDIA GPU Hardware Acceleration (see https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html):
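For orientation, the variables in these hunks sit under the "environment:" key of the Ollama service in each compose.yaml example. The sketch below shows a minimal such service with the updated values; the image tag, published port, and volume path are illustrative assumptions, not part of this commit:

services:
  ollama:
    image: ollama/ollama:latest # assumed tag; pin a specific version for reproducible setups
    restart: unless-stopped
    ports:
      - "11434:11434" # default Ollama API port
    environment:
      OLLAMA_KEEP_ALIVE: "5m" # updated: unload idle models after 5 minutes (the Ollama default)
      OLLAMA_NOPRUNE: "false" # updated: prune unused model blobs at startup
      OLLAMA_NEW_ENGINE: "true" # updated: opt in to the new Ollama engine
    volumes:
      - "./ollama:/root/.ollama" # assumed host path for persisting downloaded models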

@@ -253,15 +253,15 @@ services:
 OLLAMA_NUM_PARALLEL: "1" # maximum number of parallel requests
 OLLAMA_MAX_LOADED_MODELS: "1" # maximum number of loaded models per GPU
 OLLAMA_LOAD_TIMEOUT: "5m" # maximum time for loading models (default "5m")
-OLLAMA_KEEP_ALIVE: "15m" # duration that models stay loaded in memory (default "5m")
+OLLAMA_KEEP_ALIVE: "5m" # duration that models stay loaded in memory (default "5m")
 OLLAMA_CONTEXT_LENGTH: "4096" # maximum input context length
 OLLAMA_MULTIUSER_CACHE: "false" # optimize prompt caching for multi-user scenarios
-OLLAMA_NOPRUNE: "true" # disables pruning of model blobs at startup
+OLLAMA_NOPRUNE: "false" # disables pruning of model blobs at startup
 OLLAMA_NOHISTORY: "true" # disables readline history
 OLLAMA_FLASH_ATTENTION: "false" # enables the experimental flash attention feature
 OLLAMA_KV_CACHE_TYPE: "f16" # see https://mitjamartini.com/blog/kv-cache-quantization-in-ollama/
 OLLAMA_SCHED_SPREAD: "false" # allows scheduling models across all GPUs.
-OLLAMA_NEW_ENGINE: "false" # enables the new Ollama engine
+OLLAMA_NEW_ENGINE: "true" # enables the new Ollama engine
 # OLLAMA_DEBUG: "true" # shows additional debug information
 # OLLAMA_INTEL_GPU: "true" # enables experimental Intel GPU detection
 ## NVIDIA GPU Hardware Acceleration (optional):

@@ -175,15 +175,15 @@ services:
 OLLAMA_NUM_PARALLEL: "1" # maximum number of parallel requests
 OLLAMA_MAX_LOADED_MODELS: "1" # maximum number of loaded models per GPU
 OLLAMA_LOAD_TIMEOUT: "5m" # maximum time for loading models (default "5m")
-OLLAMA_KEEP_ALIVE: "15m" # duration that models stay loaded in memory (default "5m")
+OLLAMA_KEEP_ALIVE: "5m" # duration that models stay loaded in memory (default "5m")
 OLLAMA_CONTEXT_LENGTH: "4096" # maximum input context length
 OLLAMA_MULTIUSER_CACHE: "false" # optimize prompt caching for multi-user scenarios
-OLLAMA_NOPRUNE: "true" # disables pruning of model blobs at startup
+OLLAMA_NOPRUNE: "false" # disables pruning of model blobs at startup
 OLLAMA_NOHISTORY: "true" # disables readline history
 OLLAMA_FLASH_ATTENTION: "false" # enables the experimental flash attention feature
 OLLAMA_KV_CACHE_TYPE: "f16" # see https://mitjamartini.com/blog/kv-cache-quantization-in-ollama/
 OLLAMA_SCHED_SPREAD: "false" # allows scheduling models across all GPUs.
-OLLAMA_NEW_ENGINE: "false" # enables the new Ollama engine
+OLLAMA_NEW_ENGINE: "true" # enables the new Ollama engine
 # OLLAMA_DEBUG: "true" # shows additional debug information
 # OLLAMA_INTEL_GPU: "true" # enables experimental Intel GPU detection
 ## NVIDIA GPU Hardware Acceleration (see https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html):

@@ -180,15 +180,15 @@ services:
 OLLAMA_NUM_PARALLEL: "1" # maximum number of parallel requests
 OLLAMA_MAX_LOADED_MODELS: "1" # maximum number of loaded models per GPU
 OLLAMA_LOAD_TIMEOUT: "5m" # maximum time for loading models (default "5m")
-OLLAMA_KEEP_ALIVE: "15m" # duration that models stay loaded in memory (default "5m")
+OLLAMA_KEEP_ALIVE: "5m" # duration that models stay loaded in memory (default "5m")
 OLLAMA_CONTEXT_LENGTH: "4096" # maximum input context length
 OLLAMA_MULTIUSER_CACHE: "false" # optimize prompt caching for multi-user scenarios
-OLLAMA_NOPRUNE: "true" # disables pruning of model blobs at startup
+OLLAMA_NOPRUNE: "false" # disables pruning of model blobs at startup
 OLLAMA_NOHISTORY: "true" # disables readline history
 OLLAMA_FLASH_ATTENTION: "false" # enables the experimental flash attention feature
 OLLAMA_KV_CACHE_TYPE: "f16" # see https://mitjamartini.com/blog/kv-cache-quantization-in-ollama/
 OLLAMA_SCHED_SPREAD: "false" # allows scheduling models across all GPUs.
-OLLAMA_NEW_ENGINE: "false" # enables the new Ollama engine
+OLLAMA_NEW_ENGINE: "true" # enables the new Ollama engine
 # OLLAMA_DEBUG: "true" # shows additional debug information
 # OLLAMA_INTEL_GPU: "true" # enables experimental Intel GPU detection
 ## NVIDIA GPU Hardware Acceleration (see https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html):

@@ -180,16 +180,15 @@ services:
 OLLAMA_NUM_PARALLEL: "1" # maximum number of parallel requests
 OLLAMA_MAX_LOADED_MODELS: "1" # maximum number of loaded models per GPU
 OLLAMA_LOAD_TIMEOUT: "5m" # maximum time for loading models (default "5m")
-OLLAMA_KEEP_ALIVE: "15m" # duration that models stay loaded in memory (default "5m")
+OLLAMA_KEEP_ALIVE: "5m" # duration that models stay loaded in memory (default "5m")
 OLLAMA_CONTEXT_LENGTH: "4096" # maximum input context length
 OLLAMA_MULTIUSER_CACHE: "false" # optimize prompt caching for multi-user scenarios
-OLLAMA_NOPRUNE: "true" # disables pruning of model blobs at startup
+OLLAMA_NOPRUNE: "false" # disables pruning of model blobs at startup
 OLLAMA_NOHISTORY: "true" # disables readline history
 OLLAMA_FLASH_ATTENTION: "false" # enables the experimental flash attention feature
 OLLAMA_KV_CACHE_TYPE: "f16" # see https://mitjamartini.com/blog/kv-cache-quantization-in-ollama/
 OLLAMA_SCHED_SPREAD: "false" # allows scheduling models across all GPUs.
-OLLAMA_INTEL_GPU: "false" # enables experimental Intel GPU detection
-OLLAMA_NEW_ENGINE: "false" # enables the new Ollama engine
+OLLAMA_NEW_ENGINE: "true" # enables the new Ollama engine
 # OLLAMA_DEBUG: "true" # shows additional debug information
 # OLLAMA_INTEL_GPU: "true" # enables experimental Intel GPU detection
 ## NVIDIA GPU Hardware Acceleration (see https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html):
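Deployments that preferred the previous behavior, such as keeping models loaded longer on a dedicated GPU host, can override individual values without editing the shipped example. A minimal sketch using a compose.override.yaml, which Docker Compose merges over compose.yaml automatically (assumes the service is named "ollama", as in the examples above):

# compose.override.yaml (sketch; environment entries are merged key by key)
services:
  ollama:
    environment:
      OLLAMA_KEEP_ALIVE: "15m" # restore the longer keep-alive used before this change
      OLLAMA_NOPRUNE: "true" # keep model blobs at startup instead of pruning them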