diff --git a/internal/ai/vision/README.md b/internal/ai/vision/README.md
index 61644a4f8..0cb6c4405 100644
--- a/internal/ai/vision/README.md
+++ b/internal/ai/vision/README.md
@@ -1,6 +1,6 @@
 ## PhotoPrism — Vision Package
 
-**Last Updated:** November 25, 2025
+**Last Updated:** December 2, 2025
 
 ### Overview
 
@@ -51,20 +51,29 @@ The `vision.yml` file is usually kept in the `storage/config` directory (overrid
 
 #### Model Options
 
-| Option | Default | Description |
-|-------------------|-----------------------------------------------------------------------------------------|------------------------------------------------------------------------------------|
-| `Temperature` | engine default (`0.1` for Ollama; unset for OpenAI) | Controls randomness; clamped to `[0,2]`. `gpt-5*` OpenAI models are forced to `0`. |
-| `TopP` | engine default (`0.9` for some Ollama label defaults; unset for OpenAI) | Nucleus sampling parameter. |
-| `MaxOutputTokens` | engine default (OpenAI caption 512, labels 1024; Ollama label default 256) | Upper bound on generated tokens; adapters raise low values to defaults. |
-| `ForceJson` | engine-specific (`true` for OpenAI labels; `false` for Ollama labels; captions `false`) | Forces structured output when enabled. |
-| `SchemaVersion` | derived from schema name | Override when coordinating schema migrations. |
-| `Stop` | engine default | Array of stop sequences (e.g., `["\\n\\n"]`). |
-| `NumThread` | runtime auto | Caps CPU threads for local engines. |
-| `NumCtx` | engine default | Context window length (tokens). |
+The `Options` section adjusts model parameters such as temperature, top-p, and schema constraints when using [Ollama](ollama/README.md) or [OpenAI](openai/README.md):
+
+| Option | Default | Description |
+|-------------------|-----------------------------------------------------------------------------------------|-----------------------------------------------------------------------------------------|
+| `Temperature` | engine default (`0.1` for Ollama) | Controls randomness with a value between `0.01` and `2.0`; not used for OpenAI's GPT-5. |
+| `TopK` | engine default (model-specific) | Limits sampling to the top K tokens to reduce rare or noisy outputs. |
+| `TopP` | engine default (`0.9` for some Ollama label defaults; unset for OpenAI) | Nucleus sampling; keeps the smallest token set whose cumulative probability ≥ `p`. |
+| `MinP` | engine default (unset unless provided) | Drops tokens whose probability mass is below `p`, trimming the long tail. |
+| `TypicalP` | engine default (unset unless provided) | Keeps tokens whose typicality is below the threshold; can be combined with `TopP`/`MinP`. |
+| `Seed` | random per run (unless set) | Set a fixed value for reproducible outputs; leave unset for more variety between runs. |
+| `RepeatLastN` | engine default (model-specific) | Number of recent tokens considered for repetition penalties. |
+| `RepeatPenalty` | engine default (model-specific) | Multiplier >1 discourages repeating the same tokens or phrases. |
+| `NumPredict` | engine default (Ollama only) | Ollama-specific limit on generated tokens; serves the same purpose as `MaxOutputTokens`. |
+| `MaxOutputTokens` | engine default (OpenAI caption 512, labels 1024) | Upper bound on generated tokens; adapters raise low values to defaults. |
+| `ForceJson` | engine-specific (`true` for OpenAI labels; `false` for Ollama labels; captions `false`) | Forces structured output when enabled. |
+| `SchemaVersion` | derived from schema name | Override when coordinating schema migrations. |
+| `Stop` | engine default | Array of stop sequences (e.g., `["\\n\\n"]`). |
+| `NumThread` | runtime auto | Caps CPU threads for local engines. |
+| `NumCtx` | engine default | Context window length (tokens). |
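+
+For example, a `vision.yml` model entry could override a few of these options as sketched below — the option names match the table above, while the model name and the values themselves are only illustrative:
+
+```yaml
+Models:
+  - Type: caption
+    Model: gemma3:latest
+    Engine: ollama
+    Options:
+      Temperature: 0.1
+      TopP: 0.9
+      Seed: 42
+      NumCtx: 4096
+```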
 
 #### Model Service
 
-Used for Ollama/OpenAI (and any future HTTP engines). All credentials and identifiers support `${ENV_VAR}` expansion.
+Configures the endpoint URL, method, format, and authentication for [Ollama](ollama/README.md), [OpenAI](openai/README.md), and other engines that perform remote HTTP requests:
 
 | Field | Default | Notes |
 |------------------------------------|------------------------------------------|------------------------------------------------------|
@@ -78,6 +87,8 @@ Used for Ollama/OpenAI (and any future HTTP engines). All credentials and identi
 | `FileScheme` | set by engine alias (`data` or `base64`) | Controls image transport. |
 | `Disabled` | `false` | Disable the endpoint without removing the model. |
 
+> **Authentication:** All credentials and identifiers support `${ENV_VAR}` expansion. `Service.Key` sets an `Authorization: Bearer <key>` header; `Username`/`Password` inject HTTP basic authentication into the service URI when it does not already contain credentials.
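+
+As a sketch, an endpoint that reads its credentials from the environment could be configured as follows — `Key` comes from the note above and `Service.Model` from the precedence rules below, while the `Uri` field name, the URL, and the model name are placeholders for illustration:
+
+```yaml
+Service:
+  Uri: https://api.openai.com/v1/responses
+  Model: gpt-5-mini
+  Key: ${OPENAI_API_KEY}
+```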
+
 ### Field Behavior & Precedence
 
 - Model identifier resolution order: `Service.Model` → `Model` → `Name`. `Model.GetModel()` returns `(id, name, version)` where Ollama receives `name:version` and other engines receive `name` plus a separate `Version`.
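+
+  For instance, with the hypothetical entry below, requests would use `gemma3:latest` from `Service.Model`, falling back to `Model` and then `Name` if the higher-priority fields were removed:
+
+  ```yaml
+  - Type: caption
+    Name: caption
+    Model: gemma3
+    Service:
+      Model: gemma3:latest
+  ```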
diff --git a/internal/ai/vision/api_request.go b/internal/ai/vision/api_request.go
index 5f89ada7e..af990396e 100644
--- a/internal/ai/vision/api_request.go
+++ b/internal/ai/vision/api_request.go
@@ -32,43 +32,6 @@ const (
 	logDataTruncatedSuffix = "... (truncated)"
 )
 
-// ApiRequestOptions represents additional model parameters listed in the documentation.
-type ApiRequestOptions struct {
-	NumKeep          int      `yaml:"NumKeep,omitempty" json:"num_keep,omitempty"`
-	Seed             int      `yaml:"Seed,omitempty" json:"seed,omitempty"`
-	NumPredict       int      `yaml:"NumPredict,omitempty" json:"num_predict,omitempty"`
-	TopK             int      `yaml:"TopK,omitempty" json:"top_k,omitempty"`
-	TopP             float64  `yaml:"TopP,omitempty" json:"top_p,omitempty"`
-	MinP             float64  `yaml:"MinP,omitempty" json:"min_p,omitempty"`
-	TfsZ             float64  `yaml:"TfsZ,omitempty" json:"tfs_z,omitempty"`
-	TypicalP         float64  `yaml:"TypicalP,omitempty" json:"typical_p,omitempty"`
-	RepeatLastN      int      `yaml:"RepeatLastN,omitempty" json:"repeat_last_n,omitempty"`
-	Temperature      float64  `yaml:"Temperature,omitempty" json:"temperature,omitempty"`
-	RepeatPenalty    float64  `yaml:"RepeatPenalty,omitempty" json:"repeat_penalty,omitempty"`
-	PresencePenalty  float64  `yaml:"PresencePenalty,omitempty" json:"presence_penalty,omitempty"`
-	FrequencyPenalty float64  `yaml:"FrequencyPenalty,omitempty" json:"frequency_penalty,omitempty"`
-	Mirostat         int      `yaml:"Mirostat,omitempty" json:"mirostat,omitempty"`
-	MirostatTau      float64  `yaml:"MirostatTau,omitempty" json:"mirostat_tau,omitempty"`
-	MirostatEta      float64  `yaml:"MirostatEta,omitempty" json:"mirostat_eta,omitempty"`
-	PenalizeNewline  bool     `yaml:"PenalizeNewline,omitempty" json:"penalize_newline,omitempty"`
-	Stop             []string `yaml:"Stop,omitempty" json:"stop,omitempty"`
-	Numa             bool     `yaml:"Numa,omitempty" json:"numa,omitempty"`
-	NumCtx           int      `yaml:"NumCtx,omitempty" json:"num_ctx,omitempty"`
-	NumBatch         int      `yaml:"NumBatch,omitempty" json:"num_batch,omitempty"`
-	NumGpu           int      `yaml:"NumGpu,omitempty" json:"num_gpu,omitempty"`
-	MainGpu          int      `yaml:"MainGpu,omitempty" json:"main_gpu,omitempty"`
-	LowVram          bool     `yaml:"LowVram,omitempty" json:"low_vram,omitempty"`
-	VocabOnly        bool     `yaml:"VocabOnly,omitempty" json:"vocab_only,omitempty"`
-	UseMmap          bool     `yaml:"UseMmap,omitempty" json:"use_mmap,omitempty"`
-	UseMlock         bool     `yaml:"UseMlock,omitempty" json:"use_mlock,omitempty"`
-	NumThread        int      `yaml:"NumThread,omitempty" json:"num_thread,omitempty"`
-	MaxOutputTokens  int      `yaml:"MaxOutputTokens,omitempty" json:"max_output_tokens,omitempty"`
-	Detail           string   `yaml:"Detail,omitempty" json:"detail,omitempty"`
-	ForceJson        bool     `yaml:"ForceJson,omitempty" json:"force_json,omitempty"`
-	SchemaVersion    string   `yaml:"SchemaVersion,omitempty" json:"schema_version,omitempty"`
-	CombineOutputs   string   `yaml:"CombineOutputs,omitempty" json:"combine_outputs,omitempty"`
-}
-
 // ApiRequestContext represents a context parameter returned from a previous request.
 type ApiRequestContext = []int
 
@@ -84,7 +47,7 @@ type ApiRequest struct {
 	Url      string             `form:"url" yaml:"Url,omitempty" json:"url,omitempty"`
 	Org      string             `form:"org" yaml:"Org,omitempty" json:"org,omitempty"`
 	Project  string             `form:"project" yaml:"Project,omitempty" json:"project,omitempty"`
-	Options  *ApiRequestOptions `form:"options" yaml:"Options,omitempty" json:"options,omitempty"`
+	Options  *ModelOptions      `form:"options" yaml:"Options,omitempty" json:"options,omitempty"`
 	Context  *ApiRequestContext `form:"context" yaml:"Context,omitempty" json:"context,omitempty"`
 	Stream   bool               `form:"stream" yaml:"Stream,omitempty" json:"stream"`
 	Images   Files              `form:"images" yaml:"Images,omitempty" json:"images,omitempty"`
diff --git a/internal/ai/vision/engine.go b/internal/ai/vision/engine.go
index 67da799ef..bfde99028 100644
--- a/internal/ai/vision/engine.go
+++ b/internal/ai/vision/engine.go
@@ -36,7 +36,7 @@ type EngineDefaults interface {
 	SystemPrompt(model *Model) string
 	UserPrompt(model *Model) string
 	SchemaTemplate(model *Model) string
-	Options(model *Model) *ApiRequestOptions
+	Options(model *Model) *ModelOptions
 }
 
 // Engine groups the callbacks required to integrate a third-party vision service.
diff --git a/internal/ai/vision/engine_ollama.go b/internal/ai/vision/engine_ollama.go
index 342ba77dd..173b48f0b 100644
--- a/internal/ai/vision/engine_ollama.go
+++ b/internal/ai/vision/engine_ollama.go
@@ -78,20 +78,20 @@ func (ollamaDefaults) SchemaTemplate(model *Model) string {
 }
 
 // Options returns the Ollama service request options.
-func (ollamaDefaults) Options(model *Model) *ApiRequestOptions {
+func (ollamaDefaults) Options(model *Model) *ModelOptions {
 	if model == nil {
 		return nil
 	}
 
 	switch model.Type {
 	case ModelTypeLabels:
-		return &ApiRequestOptions{
+		return &ModelOptions{
 			Temperature: DefaultTemperature,
 			TopP:        0.9,
 			Stop:        []string{"\n\n"},
 		}
 	case ModelTypeCaption:
-		return &ApiRequestOptions{
+		return &ModelOptions{
 			Temperature: DefaultTemperature,
 		}
 	default:
diff --git a/internal/ai/vision/engine_openai.go b/internal/ai/vision/engine_openai.go
index 641a55a1c..c86268045 100644
--- a/internal/ai/vision/engine_openai.go
+++ b/internal/ai/vision/engine_openai.go
@@ -80,19 +80,19 @@ func (openaiDefaults) SchemaTemplate(model *Model) string {
 }
 
 // Options returns default OpenAI request options for the model.
-func (openaiDefaults) Options(model *Model) *ApiRequestOptions { +func (openaiDefaults) Options(model *Model) *ModelOptions { if model == nil { return nil } switch model.Type { case ModelTypeCaption: - return &ApiRequestOptions{ + return &ModelOptions{ Detail: openai.DefaultDetail, MaxOutputTokens: openai.CaptionMaxTokens, } case ModelTypeLabels: - return &ApiRequestOptions{ + return &ModelOptions{ Detail: openai.DefaultDetail, MaxOutputTokens: openai.LabelsMaxTokens, ForceJson: true, diff --git a/internal/ai/vision/engine_openai_test.go b/internal/ai/vision/engine_openai_test.go index 6fa163b3a..38a50e52a 100644 --- a/internal/ai/vision/engine_openai_test.go +++ b/internal/ai/vision/engine_openai_test.go @@ -40,7 +40,7 @@ func TestOpenAIBuilderBuildCaptionDisablesForceJSON(t *testing.T) { Type: ModelTypeCaption, Name: openai.DefaultModel, Engine: openai.EngineName, - Options: &ApiRequestOptions{ForceJson: true}, + Options: &ModelOptions{ForceJson: true}, } model.ApplyEngineDefaults() @@ -59,7 +59,7 @@ func TestApiRequestJSONForOpenAI(t *testing.T) { Prompt: "describe the scene", Images: []string{"data:image/jpeg;base64,AA=="}, ResponseFormat: ApiFormatOpenAI, - Options: &ApiRequestOptions{ + Options: &ModelOptions{ Detail: openai.DefaultDetail, MaxOutputTokens: 128, Temperature: 0.2, @@ -111,7 +111,7 @@ func TestApiRequestJSONForOpenAIDefaultSchemaName(t *testing.T) { Model: "gpt-5-mini", Images: []string{"data:image/jpeg;base64,AA=="}, ResponseFormat: ApiFormatOpenAI, - Options: &ApiRequestOptions{ + Options: &ModelOptions{ Detail: openai.DefaultDetail, MaxOutputTokens: 64, ForceJson: true, @@ -254,7 +254,7 @@ func TestPerformApiRequestOpenAISuccess(t *testing.T) { Model: "gpt-5-mini", Images: []string{"data:image/jpeg;base64,AA=="}, ResponseFormat: ApiFormatOpenAI, - Options: &ApiRequestOptions{ + Options: &ModelOptions{ Detail: openai.DefaultDetail, }, Schema: json.RawMessage(`{"type":"object"}`), @@ -299,7 +299,7 @@ func TestPerformApiRequestOpenAITextFallback(t *testing.T) { Model: "gpt-5-mini", Images: []string{"data:image/jpeg;base64,AA=="}, ResponseFormat: ApiFormatOpenAI, - Options: &ApiRequestOptions{ + Options: &ModelOptions{ Detail: openai.DefaultDetail, }, Schema: nil, diff --git a/internal/ai/vision/model.go b/internal/ai/vision/model.go index 8efd0c8e4..3ae58828f 100644 --- a/internal/ai/vision/model.go +++ b/internal/ai/vision/model.go @@ -46,7 +46,7 @@ type Model struct { SchemaFile string `yaml:"SchemaFile,omitempty" json:"schemaFile,omitempty"` Resolution int `yaml:"Resolution,omitempty" json:"resolution,omitempty"` TensorFlow *tensorflow.ModelInfo `yaml:"TensorFlow,omitempty" json:"tensorflow,omitempty"` - Options *ApiRequestOptions `yaml:"Options,omitempty" json:"options,omitempty"` + Options *ModelOptions `yaml:"Options,omitempty" json:"options,omitempty"` Service Service `yaml:"Service,omitempty" json:"service,omitempty"` Path string `yaml:"Path,omitempty" json:"-"` Disabled bool `yaml:"Disabled,omitempty" json:"disabled,omitempty"` @@ -334,12 +334,12 @@ func (m *Model) GetSource() string { // GetOptions returns the API request options, applying engine defaults on // demand. Nil receivers return nil. 
-func (m *Model) GetOptions() *ApiRequestOptions {
+func (m *Model) GetOptions() *ModelOptions {
 	if m == nil {
 		return nil
 	}
 
-	var engineDefaults *ApiRequestOptions
+	var engineDefaults *ModelOptions
 	if defaults := m.engineDefaults(); defaults != nil {
 		engineDefaults = cloneOptions(defaults.Options(m))
 	}
@@ -348,7 +348,7 @@
 	switch m.Type {
 	case ModelTypeLabels, ModelTypeCaption, ModelTypeGenerate:
 		if engineDefaults == nil {
-			engineDefaults = &ApiRequestOptions{}
+			engineDefaults = &ModelOptions{}
 		}
 		normalizeOptions(engineDefaults)
 		m.Options = engineDefaults
@@ -364,7 +364,7 @@
 	return m.Options
 }
 
-func mergeOptionDefaults(target, defaults *ApiRequestOptions) {
+func mergeOptionDefaults(target, defaults *ModelOptions) {
 	if target == nil || defaults == nil {
 		return
 	}
@@ -402,7 +402,7 @@
 	}
 }
 
-func normalizeOptions(opts *ApiRequestOptions) {
+func normalizeOptions(opts *ModelOptions) {
 	if opts == nil {
 		return
 	}
@@ -412,7 +412,7 @@
 	}
 }
 
-func cloneOptions(opts *ApiRequestOptions) *ApiRequestOptions {
+func cloneOptions(opts *ModelOptions) *ModelOptions {
 	if opts == nil {
 		return nil
 	}
diff --git a/internal/ai/vision/model_options.go b/internal/ai/vision/model_options.go
new file mode 100644
index 000000000..0a0875a29
--- /dev/null
+++ b/internal/ai/vision/model_options.go
@@ -0,0 +1,38 @@
+package vision
+
+// ModelOptions represents additional model parameters listed in the documentation.
+type ModelOptions struct {
+	NumKeep          int      `yaml:"NumKeep,omitempty" json:"num_keep,omitempty"` // Ollama ↓
+	Seed             int      `yaml:"Seed,omitempty" json:"seed,omitempty"`
+	NumPredict       int      `yaml:"NumPredict,omitempty" json:"num_predict,omitempty"`
+	Temperature      float64  `yaml:"Temperature,omitempty" json:"temperature,omitempty"`
+	TopK             int      `yaml:"TopK,omitempty" json:"top_k,omitempty"`
+	TopP             float64  `yaml:"TopP,omitempty" json:"top_p,omitempty"`
+	MinP             float64  `yaml:"MinP,omitempty" json:"min_p,omitempty"`
+	TypicalP         float64  `yaml:"TypicalP,omitempty" json:"typical_p,omitempty"`
+	TfsZ             float64  `yaml:"TfsZ,omitempty" json:"tfs_z,omitempty"`
+	RepeatLastN      int      `yaml:"RepeatLastN,omitempty" json:"repeat_last_n,omitempty"`
+	RepeatPenalty    float64  `yaml:"RepeatPenalty,omitempty" json:"repeat_penalty,omitempty"`
+	PresencePenalty  float64  `yaml:"PresencePenalty,omitempty" json:"presence_penalty,omitempty"`
+	FrequencyPenalty float64  `yaml:"FrequencyPenalty,omitempty" json:"frequency_penalty,omitempty"`
+	Mirostat         int      `yaml:"Mirostat,omitempty" json:"mirostat,omitempty"`
+	MirostatTau      float64  `yaml:"MirostatTau,omitempty" json:"mirostat_tau,omitempty"`
+	MirostatEta      float64  `yaml:"MirostatEta,omitempty" json:"mirostat_eta,omitempty"`
+	PenalizeNewline  bool     `yaml:"PenalizeNewline,omitempty" json:"penalize_newline,omitempty"`
+	Stop             []string `yaml:"Stop,omitempty" json:"stop,omitempty"`
+	Numa             bool     `yaml:"Numa,omitempty" json:"numa,omitempty"`
+	NumCtx           int      `yaml:"NumCtx,omitempty" json:"num_ctx,omitempty"`
+	NumBatch         int      `yaml:"NumBatch,omitempty" json:"num_batch,omitempty"`
+	NumGpu           int      `yaml:"NumGpu,omitempty" json:"num_gpu,omitempty"`
+	MainGpu          int      `yaml:"MainGpu,omitempty" json:"main_gpu,omitempty"`
+	LowVram          bool     `yaml:"LowVram,omitempty" json:"low_vram,omitempty"`
+	VocabOnly        bool     `yaml:"VocabOnly,omitempty" json:"vocab_only,omitempty"`
+	UseMmap          bool     `yaml:"UseMmap,omitempty" json:"use_mmap,omitempty"`
+	UseMlock         bool     `yaml:"UseMlock,omitempty" json:"use_mlock,omitempty"`
+	NumThread        int      `yaml:"NumThread,omitempty" json:"num_thread,omitempty"`
+	MaxOutputTokens  int      `yaml:"MaxOutputTokens,omitempty" json:"max_output_tokens,omitempty"` // OpenAI ↓
+	Detail           string   `yaml:"Detail,omitempty" json:"detail,omitempty"`
+	ForceJson        bool     `yaml:"ForceJson,omitempty" json:"force_json,omitempty"`
+	SchemaVersion    string   `yaml:"SchemaVersion,omitempty" json:"schema_version,omitempty"`
+	CombineOutputs   string   `yaml:"CombineOutputs,omitempty" json:"combine_outputs,omitempty"`
+}
diff --git a/internal/ai/vision/model_test.go b/internal/ai/vision/model_test.go
index 743446d8e..a435d35c7 100644
--- a/internal/ai/vision/model_test.go
+++ b/internal/ai/vision/model_test.go
@@ -158,7 +158,7 @@ func TestModelGetOptionsRespectsCustomValues(t *testing.T) {
 	model := &Model{
 		Type:   ModelTypeLabels,
 		Engine: ollama.EngineName,
-		Options: &ApiRequestOptions{
+		Options: &ModelOptions{
 			Temperature: 5,
 			TopP:        0.95,
 			Stop:        []string{"CUSTOM"},
@@ -183,7 +183,7 @@ func TestModelGetOptionsFillsMissingFields(t *testing.T) {
 	model := &Model{
 		Type:   ModelTypeLabels,
 		Engine: ollama.EngineName,
-		Options: &ApiRequestOptions{},
+		Options: &ModelOptions{},
 	}
 
 	model.ApplyEngineDefaults()
diff --git a/internal/ai/vision/models.go b/internal/ai/vision/models.go
index 71083e05c..c88bef528 100644
--- a/internal/ai/vision/models.go
+++ b/internal/ai/vision/models.go
@@ -89,7 +89,7 @@ var (
 	}
 	CaptionModel = &Model{
 		Type:       ModelTypeCaption,
-		Name:       ollama.CaptionModel,
+		Model:      ollama.CaptionModel,
 		Version:    VersionLatest,
 		Engine:     ollama.EngineName,
 		Resolution: 720, // Original aspect ratio, with a max size of 720 x 720 pixels.
diff --git a/internal/ai/vision/ollama/defaults.go b/internal/ai/vision/ollama/defaults.go
index 64530def9..ad570ff7c 100644
--- a/internal/ai/vision/ollama/defaults.go
+++ b/internal/ai/vision/ollama/defaults.go
@@ -4,7 +4,7 @@ const (
 	// CaptionPrompt instructs Ollama caption models to emit a single, active-voice sentence.
 	CaptionPrompt = "Create a caption with exactly one sentence in the active voice that describes the main visual content. Begin with the main subject and clear action. Avoid text formatting, meta-language, and filler words."
 	// CaptionModel names the default caption model bundled with our adapter defaults.
-	CaptionModel = "gemma3"
+	CaptionModel = "gemma3:latest"
 	// LabelConfidenceDefault is used when the model omits the confidence field.
 	LabelConfidenceDefault = 0.5
 	// LabelSystem defines the system prompt shared by Ollama label models. It aims to ensure that single-word nouns are returned.
diff --git a/internal/ai/vision/testdata/vision.yml b/internal/ai/vision/testdata/vision.yml
index 6529e4f29..bf5e833d9 100644
--- a/internal/ai/vision/testdata/vision.yml
+++ b/internal/ai/vision/testdata/vision.yml
@@ -65,7 +65,7 @@ Models:
     Name: embeddings
     Outputs: 512
   - Type: caption
-    Name: gemma3
+    Model: gemma3:latest
     Version: latest
     Engine: ollama
     Resolution: 720