diff --git a/internal/ai/vision/api_client.go b/internal/ai/vision/api_client.go
index 29c24b259..62f757dfe 100644
--- a/internal/ai/vision/api_client.go
+++ b/internal/ai/vision/api_client.go
@@ -9,6 +9,9 @@ import (
"io"
"net/http"
+ "github.com/sirupsen/logrus"
+
+ "github.com/photoprism/photoprism/internal/ai/vision/ollama"
"github.com/photoprism/photoprism/pkg/clean"
"github.com/photoprism/photoprism/pkg/http/header"
)
@@ -69,6 +72,10 @@ func PerformApiRequest(apiRequest *ApiRequest, uri, method, key string) (apiResp
return nil, parseErr
}
+ if log.IsLevelEnabled(logrus.TraceLevel) {
+ log.Tracef("vision: response %s", string(body))
+ }
+
return parsed, nil
}
@@ -89,12 +96,12 @@ func PerformApiRequest(apiRequest *ApiRequest, uri, method, key string) (apiResp
return apiResponse, nil
}
-func decodeOllamaResponse(data []byte) (*ApiResponseOllama, error) {
- resp := &ApiResponseOllama{}
+func decodeOllamaResponse(data []byte) (*ollama.Response, error) {
+ resp := &ollama.Response{}
dec := json.NewDecoder(bytes.NewReader(data))
for {
- var chunk ApiResponseOllama
+ var chunk ollama.Response
if err := dec.Decode(&chunk); err != nil {
if errors.Is(err, io.EOF) {
break
diff --git a/internal/ai/vision/api_client_test.go b/internal/ai/vision/api_client_test.go
index d933a1227..d84219086 100644
--- a/internal/ai/vision/api_client_test.go
+++ b/internal/ai/vision/api_client_test.go
@@ -8,6 +8,7 @@ import (
"github.com/stretchr/testify/assert"
+ "github.com/photoprism/photoprism/internal/ai/vision/ollama"
"github.com/photoprism/photoprism/pkg/http/scheme"
)
@@ -49,7 +50,7 @@ func TestPerformApiRequestOllama(t *testing.T) {
var req ApiRequest
assert.NoError(t, json.NewDecoder(r.Body).Decode(&req))
assert.Equal(t, FormatJSON, req.Format)
- assert.NoError(t, json.NewEncoder(w).Encode(ApiResponseOllama{
+ assert.NoError(t, json.NewEncoder(w).Encode(ollama.Response{
Model: "qwen2.5vl:latest",
Response: `{"labels":[{"name":"test","confidence":0.9,"topicality":0.8}]}`,
}))
@@ -72,7 +73,7 @@ func TestPerformApiRequestOllama(t *testing.T) {
})
t.Run("LabelsWithCodeFence", func(t *testing.T) {
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
- assert.NoError(t, json.NewEncoder(w).Encode(ApiResponseOllama{
+ assert.NoError(t, json.NewEncoder(w).Encode(ollama.Response{
Model: "gemma3:latest",
Response: "```json\n{\"labels\":[{\"name\":\"lingerie\",\"confidence\":0.81,\"topicality\":0.73}]}\n```\nThe model provided additional commentary.",
}))
@@ -95,7 +96,7 @@ func TestPerformApiRequestOllama(t *testing.T) {
})
t.Run("CaptionFallback", func(t *testing.T) {
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
- assert.NoError(t, json.NewEncoder(w).Encode(ApiResponseOllama{
+ assert.NoError(t, json.NewEncoder(w).Encode(ollama.Response{
Model: "qwen2.5vl:latest",
Response: "plain text",
}))
diff --git a/internal/ai/vision/api_ollama.go b/internal/ai/vision/api_ollama.go
index 56f70d454..8231234c6 100644
--- a/internal/ai/vision/api_ollama.go
+++ b/internal/ai/vision/api_ollama.go
@@ -1,10 +1,8 @@
package vision
import (
- "errors"
"fmt"
"os"
- "time"
"github.com/photoprism/photoprism/pkg/clean"
"github.com/photoprism/photoprism/pkg/http/scheme"
@@ -12,53 +10,6 @@ import (
"github.com/photoprism/photoprism/pkg/rnd"
)
-// ApiResponseOllama represents a Ollama API service response.
-type ApiResponseOllama struct {
- Id string `yaml:"Id,omitempty" json:"id,omitempty"`
- Code int `yaml:"Code,omitempty" json:"code,omitempty"`
- Error string `yaml:"Error,omitempty" json:"error,omitempty"`
- Model string `yaml:"Model,omitempty" json:"model,omitempty"`
- CreatedAt time.Time `yaml:"CreatedAt,omitempty" json:"created_at,omitempty"`
- Response string `yaml:"Response,omitempty" json:"response,omitempty"`
- Done bool `yaml:"Done,omitempty" json:"done,omitempty"`
- Context []int `yaml:"Context,omitempty" json:"context,omitempty"`
- TotalDuration int64 `yaml:"TotalDuration,omitempty" json:"total_duration,omitempty"`
- LoadDuration int `yaml:"LoadDuration,omitempty" json:"load_duration,omitempty"`
- PromptEvalCount int `yaml:"PromptEvalCount,omitempty" json:"prompt_eval_count,omitempty"`
- PromptEvalDuration int `yaml:"PromptEvalDuration,omitempty" json:"prompt_eval_duration,omitempty"`
- EvalCount int `yaml:"EvalCount,omitempty" json:"eval_count,omitempty"`
- EvalDuration int64 `yaml:"EvalDuration,omitempty" json:"eval_duration,omitempty"`
- Result ApiResult `yaml:"Result,omitempty" json:"result,omitempty"`
-}
-
-// Err returns an error if the request has failed.
-func (r *ApiResponseOllama) Err() error {
- if r == nil {
- return errors.New("response is nil")
- }
-
- if r.Code >= 400 {
- if r.Error != "" {
- return errors.New(r.Error)
- }
-
- return fmt.Errorf("error %d", r.Code)
- } else if r.Result.IsEmpty() {
- return errors.New("no result")
- }
-
- return nil
-}
-
-// HasResult checks if there is at least one result in the response data.
-func (r *ApiResponseOllama) HasResult() bool {
- if r == nil {
- return false
- }
-
- return !r.Result.IsEmpty()
-}
-
// NewApiRequestOllama returns a new Ollama API request with the specified images as payload.
func NewApiRequestOllama(images Files, fileScheme scheme.Type) (*ApiRequest, error) {
imagesData := make(Files, len(images))
diff --git a/internal/ai/vision/api_request.go b/internal/ai/vision/api_request.go
index 6ca0450bb..f227b7a75 100644
--- a/internal/ai/vision/api_request.go
+++ b/internal/ai/vision/api_request.go
@@ -11,6 +11,8 @@ import (
"github.com/sirupsen/logrus"
+ "github.com/photoprism/photoprism/internal/ai/vision/openai"
+ "github.com/photoprism/photoprism/internal/ai/vision/schema"
"github.com/photoprism/photoprism/internal/api/download"
"github.com/photoprism/photoprism/pkg/clean"
"github.com/photoprism/photoprism/pkg/fs"
@@ -58,6 +60,11 @@ type ApiRequestOptions struct {
UseMmap bool `yaml:"UseMmap,omitempty" json:"use_mmap,omitempty"`
UseMlock bool `yaml:"UseMlock,omitempty" json:"use_mlock,omitempty"`
NumThread int `yaml:"NumThread,omitempty" json:"num_thread,omitempty"`
+ MaxOutputTokens int `yaml:"MaxOutputTokens,omitempty" json:"max_output_tokens,omitempty"`
+ Detail string `yaml:"Detail,omitempty" json:"detail,omitempty"`
+ ForceJson bool `yaml:"ForceJson,omitempty" json:"force_json,omitempty"`
+ SchemaVersion string `yaml:"SchemaVersion,omitempty" json:"schema_version,omitempty"`
+ CombineOutputs string `yaml:"CombineOutputs,omitempty" json:"combine_outputs,omitempty"`
}
// ApiRequestContext represents a context parameter returned from a previous request.
@@ -77,6 +84,7 @@ type ApiRequest struct {
Context *ApiRequestContext `form:"context" yaml:"Context,omitempty" json:"context,omitempty"`
Stream bool `form:"stream" yaml:"Stream,omitempty" json:"stream"`
Images Files `form:"images" yaml:"Images,omitempty" json:"images,omitempty"`
+ Schema json.RawMessage `form:"schema" yaml:"Schema,omitempty" json:"schema,omitempty"`
ResponseFormat ApiFormat `form:"-" yaml:"-" json:"-"`
}
@@ -195,6 +203,14 @@ func (r *ApiRequest) GetResponseFormat() ApiFormat {
// JSON returns the request data as JSON-encoded bytes.
func (r *ApiRequest) JSON() ([]byte, error) {
+ if r == nil {
+ return nil, errors.New("api request is nil")
+ }
+
+ if r.ResponseFormat == ApiFormatOpenAI {
+ return r.openAIJSON()
+ }
+
return json.Marshal(*r)
}
@@ -229,6 +245,8 @@ func (r *ApiRequest) sanitizedForLog() ApiRequest {
sanitized.Url = sanitizeLogPayload(r.Url)
+ sanitized.Schema = r.Schema
+
return sanitized
}
@@ -287,3 +305,134 @@ func isLikelyBase64(value string) bool {
return true
}
+
+// openAIJSON converts the request data into an OpenAI Responses API payload.
+func (r *ApiRequest) openAIJSON() ([]byte, error) {
+ detail := openai.DefaultDetail
+
+ if opts := r.Options; opts != nil && strings.TrimSpace(opts.Detail) != "" {
+ detail = strings.TrimSpace(opts.Detail)
+ }
+
+ messages := make([]openai.InputMessage, 0, 2)
+
+ if system := strings.TrimSpace(r.System); system != "" {
+ messages = append(messages, openai.InputMessage{
+ Role: "system",
+ Type: "message",
+ Content: []openai.ContentItem{
+ {
+ Type: openai.ContentTypeText,
+ Text: system,
+ },
+ },
+ })
+ }
+
+ userContent := make([]openai.ContentItem, 0, len(r.Images)+1)
+
+ if prompt := strings.TrimSpace(r.Prompt); prompt != "" {
+ userContent = append(userContent, openai.ContentItem{
+ Type: openai.ContentTypeText,
+ Text: prompt,
+ })
+ }
+
+ for _, img := range r.Images {
+ if img == "" {
+ continue
+ }
+
+ userContent = append(userContent, openai.ContentItem{
+ Type: openai.ContentTypeImage,
+ ImageURL: img,
+ Detail: detail,
+ })
+ }
+
+ if len(userContent) > 0 {
+ messages = append(messages, openai.InputMessage{
+ Role: "user",
+ Type: "message",
+ Content: userContent,
+ })
+ }
+
+ if len(messages) == 0 {
+ return nil, errors.New("openai request requires at least one message")
+ }
+
+ payload := openai.HTTPRequest{
+ Model: strings.TrimSpace(r.Model),
+ Input: messages,
+ }
+
+ if payload.Model == "" {
+ payload.Model = openai.DefaultModel
+ }
+
+ if strings.HasPrefix(strings.ToLower(payload.Model), "gpt-5") {
+ payload.Reasoning = &openai.Reasoning{Effort: "low"}
+ }
+
+ if opts := r.Options; opts != nil {
+ if opts.MaxOutputTokens > 0 {
+ payload.MaxOutputTokens = opts.MaxOutputTokens
+ }
+
+ if opts.Temperature > 0 {
+ payload.Temperature = opts.Temperature
+ }
+
+ if opts.TopP > 0 {
+ payload.TopP = opts.TopP
+ }
+
+ if opts.PresencePenalty != 0 {
+ payload.PresencePenalty = opts.PresencePenalty
+ }
+
+ if opts.FrequencyPenalty != 0 {
+ payload.FrequencyPenalty = opts.FrequencyPenalty
+ }
+ }
+
+ if format := buildOpenAIResponseFormat(r); format != nil {
+ payload.Text = &openai.TextOptions{
+ Format: format,
+ }
+ }
+
+ return json.Marshal(payload)
+}
+
+// buildOpenAIResponseFormat determines which response_format to send to OpenAI.
+func buildOpenAIResponseFormat(r *ApiRequest) *openai.ResponseFormat {
+ if r == nil {
+ return nil
+ }
+
+ opts := r.Options
+ hasSchema := len(r.Schema) > 0
+
+ if !hasSchema && (opts == nil || !opts.ForceJson) {
+ return nil
+ }
+
+ result := &openai.ResponseFormat{}
+
+ if hasSchema {
+ result.Type = openai.ResponseFormatJSONSchema
+ result.Schema = r.Schema
+
+ if opts != nil && strings.TrimSpace(opts.SchemaVersion) != "" {
+ result.Name = strings.TrimSpace(opts.SchemaVersion)
+ } else {
+ result.Name = schema.JsonSchemaName(r.Schema, openai.DefaultSchemaVersion)
+ }
+ } else {
+ result.Type = openai.ResponseFormatJSONObject
+ }
+
+ return result
+}
diff --git a/internal/ai/vision/caption.go b/internal/ai/vision/caption.go
index 585795f7b..6d27eae32 100644
--- a/internal/ai/vision/caption.go
+++ b/internal/ai/vision/caption.go
@@ -53,7 +53,11 @@ func captionInternal(images Files, mediaSrc media.Src) (result *CaptionResult, m
apiRequest.System = model.GetSystemPrompt()
apiRequest.Prompt = model.GetPrompt()
- apiRequest.Options = model.GetOptions()
+
+ if apiRequest.Options == nil {
+ apiRequest.Options = model.GetOptions()
+ }
+
apiRequest.WriteLog()
if apiResponse, err = PerformApiRequest(apiRequest, uri, method, model.EndpointKey()); err != nil {
diff --git a/internal/ai/vision/engine.go b/internal/ai/vision/engine.go
index 8839fc97f..67da799ef 100644
--- a/internal/ai/vision/engine.go
+++ b/internal/ai/vision/engine.go
@@ -58,14 +58,15 @@ func init() {
RegisterEngineAlias(EngineVision, EngineInfo{
RequestFormat: ApiFormatVision,
ResponseFormat: ApiFormatVision,
- FileScheme: string(scheme.Data),
+ FileScheme: scheme.Data,
DefaultResolution: DefaultResolution,
})
RegisterEngineAlias(openai.EngineName, EngineInfo{
+ Uri: "https://api.openai.com/v1/responses",
RequestFormat: ApiFormatOpenAI,
ResponseFormat: ApiFormatOpenAI,
- FileScheme: string(scheme.Data),
+ FileScheme: scheme.Data,
DefaultResolution: openai.DefaultResolution,
})
}
@@ -79,6 +80,7 @@ func RegisterEngine(format ApiFormat, engine Engine) {
// EngineInfo describes metadata that can be associated with an engine alias.
type EngineInfo struct {
+ Uri string
RequestFormat ApiFormat
ResponseFormat ApiFormat
FileScheme string
diff --git a/internal/ai/vision/engine_ollama.go b/internal/ai/vision/engine_ollama.go
index 413078dc4..816505037 100644
--- a/internal/ai/vision/engine_ollama.go
+++ b/internal/ai/vision/engine_ollama.go
@@ -28,7 +28,7 @@ func init() {
RegisterEngineAlias(ollama.EngineName, EngineInfo{
RequestFormat: ApiFormatOllama,
ResponseFormat: ApiFormatOllama,
- FileScheme: string(scheme.Base64),
+ FileScheme: scheme.Base64,
DefaultResolution: ollama.DefaultResolution,
})
@@ -72,7 +72,7 @@ func (ollamaDefaults) SchemaTemplate(model *Model) string {
switch model.Type {
case ModelTypeLabels:
- return ollama.LabelsSchema(model.PromptContains("nsfw"))
+ return ollama.SchemaLabels(model.PromptContains("nsfw"))
}
return ""
@@ -134,64 +134,93 @@ func (ollamaParser) Parse(ctx context.Context, req *ApiRequest, raw []byte, stat
return nil, err
}
- result := &ApiResponse{
+ response := &ApiResponse{
Id: req.GetId(),
Code: status,
Model: &Model{Name: ollamaResp.Model},
Result: ApiResult{
- Labels: append([]LabelResult{}, ollamaResp.Result.Labels...),
- Caption: func() *CaptionResult {
- if ollamaResp.Result.Caption != nil {
- copyCaption := *ollamaResp.Result.Caption
- return &copyCaption
- }
- return nil
- }(),
+ Labels: convertOllamaLabels(ollamaResp.Result.Labels),
+ Caption: convertOllamaCaption(ollamaResp.Result.Caption),
},
}
- parsedLabels := len(result.Result.Labels) > 0
+ parsedLabels := len(response.Result.Labels) > 0
if !parsedLabels && strings.TrimSpace(ollamaResp.Response) != "" && req.Format == FormatJSON {
if labels, parseErr := parseOllamaLabels(ollamaResp.Response); parseErr != nil {
log.Debugf("vision: %s (parse ollama labels)", clean.Error(parseErr))
} else if len(labels) > 0 {
- result.Result.Labels = append(result.Result.Labels, labels...)
+ response.Result.Labels = append(response.Result.Labels, labels...)
parsedLabels = true
}
}
if parsedLabels {
- filtered := result.Result.Labels[:0]
- for i := range result.Result.Labels {
- if result.Result.Labels[i].Confidence <= 0 {
- result.Result.Labels[i].Confidence = ollama.LabelConfidenceDefault
+ filtered := response.Result.Labels[:0]
+ for i := range response.Result.Labels {
+ if response.Result.Labels[i].Confidence <= 0 {
+ response.Result.Labels[i].Confidence = ollama.LabelConfidenceDefault
}
- if result.Result.Labels[i].Topicality <= 0 {
- result.Result.Labels[i].Topicality = result.Result.Labels[i].Confidence
+ if response.Result.Labels[i].Topicality <= 0 {
+ response.Result.Labels[i].Topicality = response.Result.Labels[i].Confidence
}
// Apply thresholds and canonicalize the name.
- normalizeLabelResult(&result.Result.Labels[i])
+ normalizeLabelResult(&response.Result.Labels[i])
- if result.Result.Labels[i].Name == "" {
+ if response.Result.Labels[i].Name == "" {
continue
}
- if result.Result.Labels[i].Source == "" {
- result.Result.Labels[i].Source = entity.SrcOllama
+ if response.Result.Labels[i].Source == "" {
+ response.Result.Labels[i].Source = entity.SrcOllama
}
- filtered = append(filtered, result.Result.Labels[i])
+ filtered = append(filtered, response.Result.Labels[i])
}
- result.Result.Labels = filtered
+ response.Result.Labels = filtered
} else if caption := strings.TrimSpace(ollamaResp.Response); caption != "" {
- result.Result.Caption = &CaptionResult{
+ response.Result.Caption = &CaptionResult{
Text: caption,
Source: entity.SrcOllama,
}
}
- return result, nil
+ return response, nil
+}
+
+func convertOllamaLabels(payload []ollama.LabelPayload) []LabelResult {
+ if len(payload) == 0 {
+ return nil
+ }
+
+ labels := make([]LabelResult, len(payload))
+
+ for i := range payload {
+ labels[i] = LabelResult{
+ Name: payload[i].Name,
+ Source: payload[i].Source,
+ Priority: payload[i].Priority,
+ Confidence: payload[i].Confidence,
+ Topicality: payload[i].Topicality,
+ Categories: payload[i].Categories,
+ NSFW: payload[i].NSFW,
+ NSFWConfidence: payload[i].NSFWConfidence,
+ }
+ }
+
+ return labels
+}
+
+func convertOllamaCaption(payload *ollama.CaptionPayload) *CaptionResult {
+ if payload == nil {
+ return nil
+ }
+
+ return &CaptionResult{
+ Text: payload.Text,
+ Source: payload.Source,
+ Confidence: payload.Confidence,
+ }
}
diff --git a/internal/ai/vision/engine_ollama_test.go b/internal/ai/vision/engine_ollama_test.go
index dffc6fe7d..44d62bdeb 100644
--- a/internal/ai/vision/engine_ollama_test.go
+++ b/internal/ai/vision/engine_ollama_test.go
@@ -10,9 +10,9 @@ import (
func TestOllamaDefaultConfidenceApplied(t *testing.T) {
req := &ApiRequest{Format: FormatJSON}
- payload := ApiResponseOllama{
- Result: ApiResult{
- Labels: []LabelResult{{Name: "forest path", Confidence: 0, Topicality: 0}},
+ payload := ollama.Response{
+ Result: ollama.ResultPayload{
+ Labels: []ollama.LabelPayload{{Name: "forest path", Confidence: 0, Topicality: 0}},
},
}
raw, err := json.Marshal(payload)
diff --git a/internal/ai/vision/engine_openai.go b/internal/ai/vision/engine_openai.go
index 8a73c2431..3cb2a226e 100644
--- a/internal/ai/vision/engine_openai.go
+++ b/internal/ai/vision/engine_openai.go
@@ -1,18 +1,342 @@
package vision
import (
+ "context"
+ "encoding/json"
+ "errors"
+ "fmt"
+ "strings"
+
"github.com/photoprism/photoprism/internal/ai/vision/openai"
+ "github.com/photoprism/photoprism/internal/entity"
+ "github.com/photoprism/photoprism/pkg/clean"
"github.com/photoprism/photoprism/pkg/http/scheme"
)
-// init registers the OpenAI engine alias so models can set Engine: "openai"
-// and inherit sensible defaults (request/response formats, file scheme, and
-// preferred thumbnail resolution).
+// openaiDefaults provides canned prompts, schema templates, and options for OpenAI engines.
+type openaiDefaults struct{}
+
+// openaiBuilder prepares ApiRequest objects for OpenAI's Responses API.
+type openaiBuilder struct{}
+
+// openaiParser converts Responses API payloads into ApiResponse instances.
+type openaiParser struct{}
+
func init() {
- RegisterEngineAlias(openai.EngineName, EngineInfo{
- RequestFormat: ApiFormatOpenAI,
- ResponseFormat: ApiFormatOpenAI,
- FileScheme: string(scheme.Base64),
- DefaultResolution: openai.DefaultResolution,
+ RegisterEngine(ApiFormatOpenAI, Engine{
+ Builder: openaiBuilder{},
+ Parser: openaiParser{},
+ Defaults: openaiDefaults{},
})
}
+
+// SystemPrompt returns the default OpenAI system prompt for the specified model type.
+func (openaiDefaults) SystemPrompt(model *Model) string {
+ if model == nil {
+ return ""
+ }
+
+ switch model.Type {
+ case ModelTypeCaption:
+ return openai.CaptionSystem
+ case ModelTypeLabels:
+ return openai.LabelSystem
+ default:
+ return ""
+ }
+}
+
+// UserPrompt returns the default OpenAI user prompt for the specified model type.
+func (openaiDefaults) UserPrompt(model *Model) string {
+ if model == nil {
+ return ""
+ }
+
+ switch model.Type {
+ case ModelTypeCaption:
+ return openai.CaptionPrompt
+ case ModelTypeLabels:
+ if DetectNSFWLabels {
+ return openai.LabelPromptNSFW
+ }
+ return openai.LabelPromptDefault
+ default:
+ return ""
+ }
+}
+
+// SchemaTemplate returns the JSON schema template for the model, if applicable.
+func (openaiDefaults) SchemaTemplate(model *Model) string {
+ if model == nil {
+ return ""
+ }
+
+ switch model.Type {
+ case ModelTypeLabels:
+ return string(openai.SchemaLabels(model.PromptContains("nsfw")))
+ default:
+ return ""
+ }
+}
+
+// Options returns default OpenAI request options for the model.
+func (openaiDefaults) Options(model *Model) *ApiRequestOptions {
+ if model == nil {
+ return nil
+ }
+
+ switch model.Type {
+ case ModelTypeCaption:
+ /*
+ Options:
+ Detail: low
+ MaxOutputTokens: 512
+ Temperature: 0.1
+ TopP: 0.9
+ (Sampling values are zeroed for GPT-5 models in openaiBuilder.Build.)
+ */
+ return &ApiRequestOptions{
+ Detail: openai.DefaultDetail,
+ MaxOutputTokens: openai.CaptionMaxTokens,
+ Temperature: openai.DefaultTemperature,
+ TopP: openai.DefaultTopP,
+ }
+ case ModelTypeLabels:
+ /*
+ Options:
+ Detail: low
+ MaxOutputTokens: 1024
+ Temperature: 0.1
+ ForceJson: true
+ SchemaVersion: "photoprism_vision_labels_v1"
+ (Sampling values are zeroed for GPT-5 models in openaiBuilder.Build.)
+ */
+ return &ApiRequestOptions{
+ Detail: openai.DefaultDetail,
+ MaxOutputTokens: openai.LabelsMaxTokens,
+ Temperature: openai.DefaultTemperature,
+ TopP: openai.DefaultTopP,
+ ForceJson: true,
+ }
+ default:
+ return nil
+ }
+}
+
+// Build constructs an OpenAI request payload using base64-encoded thumbnails.
+func (openaiBuilder) Build(ctx context.Context, model *Model, files Files) (*ApiRequest, error) {
+ if model == nil {
+ return nil, ErrInvalidModel
+ }
+
+ dataReq, err := NewApiRequestImages(files, scheme.Data)
+ if err != nil {
+ return nil, err
+ }
+
+ req := &ApiRequest{
+ Id: dataReq.Id,
+ Images: append(Files(nil), dataReq.Images...),
+ ResponseFormat: ApiFormatOpenAI,
+ }
+
+ if opts := model.GetOptions(); opts != nil {
+ req.Options = cloneOptions(opts)
+ if model.Type == ModelTypeCaption {
+ // Captions default to plain text responses; structured JSON is optional.
+ req.Options.ForceJson = false
+ if req.Options.MaxOutputTokens < openai.CaptionMaxTokens {
+ req.Options.MaxOutputTokens = openai.CaptionMaxTokens
+ }
+ } else if model.Type == ModelTypeLabels {
+ if req.Options.MaxOutputTokens < openai.LabelsMaxTokens {
+ req.Options.MaxOutputTokens = openai.LabelsMaxTokens
+ }
+ }
+
+ if strings.HasPrefix(strings.ToLower(strings.TrimSpace(model.Name)), "gpt-5") {
+ req.Options.Temperature = 0
+ req.Options.TopP = 0
+ }
+ }
+
+ if schema := strings.TrimSpace(model.SchemaTemplate()); schema != "" {
+ if raw, parseErr := parseOpenAISchema(schema); parseErr != nil {
+ log.Warnf("vision: failed to parse OpenAI schema template (%s)", clean.Error(parseErr))
+ } else {
+ req.Schema = raw
+ }
+ }
+
+ return req, nil
+}
+
+// Parse converts an OpenAI Responses API payload into the internal ApiResponse representation.
+func (openaiParser) Parse(ctx context.Context, req *ApiRequest, raw []byte, status int) (*ApiResponse, error) {
+ if status >= 300 {
+ if msg := openai.ParseErrorMessage(raw); msg != "" {
+ return nil, fmt.Errorf("openai: %s", msg)
+ }
+ return nil, fmt.Errorf("openai: status %d", status)
+ }
+
+ var resp openai.Response
+ if err := json.Unmarshal(raw, &resp); err != nil {
+ return nil, err
+ }
+
+ if resp.Error != nil && resp.Error.Message != "" {
+ return nil, errors.New(resp.Error.Message)
+ }
+
+ result := ApiResult{}
+ if jsonPayload := resp.FirstJSON(); len(jsonPayload) > 0 {
+ if err := populateOpenAIJSONResult(&result, jsonPayload); err != nil {
+ log.Debugf("vision: %s (parse openai json payload)", clean.Error(err))
+ }
+ }
+
+ if result.Caption == nil {
+ if text := resp.FirstText(); text != "" {
+ trimmed := strings.TrimSpace(text)
+ var parsedJSON bool
+
+ if len(trimmed) > 0 && (trimmed[0] == '{' || trimmed[0] == '[') {
+ if err := populateOpenAIJSONResult(&result, json.RawMessage(trimmed)); err != nil {
+ log.Debugf("vision: %s (parse openai json text payload)", clean.Error(err))
+ } else {
+ parsedJSON = true
+ }
+ }
+
+ if !parsedJSON && trimmed != "" {
+ result.Caption = &CaptionResult{
+ Text: trimmed,
+ Source: entity.SrcOpenAI,
+ }
+ }
+ }
+ }
+
+ var responseID string
+ if req != nil {
+ responseID = req.GetId()
+ }
+
+ modelName := strings.TrimSpace(resp.Model)
+ if modelName == "" && req != nil {
+ modelName = strings.TrimSpace(req.Model)
+ }
+
+ return &ApiResponse{
+ Id: responseID,
+ Code: status,
+ Model: &Model{Name: modelName},
+ Result: result,
+ }, nil
+}
+
+// parseOpenAISchema validates the provided JSON schema and returns it as a raw message.
+func parseOpenAISchema(schema string) (json.RawMessage, error) {
+ var raw json.RawMessage
+ if err := json.Unmarshal([]byte(schema), &raw); err != nil {
+ return nil, err
+ }
+ return normalizeOpenAISchema(raw)
+}
+
+// normalizeOpenAISchema upgrades legacy label schema definitions so they comply with
+// OpenAI's json_schema format requirements.
+func normalizeOpenAISchema(raw json.RawMessage) (json.RawMessage, error) {
+ if len(raw) == 0 {
+ return raw, nil
+ }
+
+ var doc map[string]any
+ if err := json.Unmarshal(raw, &doc); err != nil {
+ // Fallback to the original payload if it isn't a JSON object.
+ return raw, nil
+ }
+
+ if t, ok := doc["type"]; ok {
+ if typeStr, ok := t.(string); ok && strings.TrimSpace(typeStr) != "" {
+ return raw, nil
+ }
+ }
+
+ if _, ok := doc["properties"]; ok {
+ return raw, nil
+ }
+
+ labels, ok := doc["labels"]
+ if !ok {
+ return raw, nil
+ }
+
+ nsfw := false
+
+ if items, ok := labels.([]any); ok && len(items) > 0 {
+ if first, ok := items[0].(map[string]any); ok {
+ if _, hasNSFW := first["nsfw"]; hasNSFW {
+ nsfw = true
+ }
+ if _, hasNSFWConfidence := first["nsfw_confidence"]; hasNSFWConfidence {
+ nsfw = true
+ }
+ }
+ }
+
+ return openai.SchemaLabels(nsfw), nil
+}
+
+// populateOpenAIJSONResult unmarshals a structured OpenAI response into ApiResult fields.
+func populateOpenAIJSONResult(result *ApiResult, payload json.RawMessage) error {
+ if result == nil || len(payload) == 0 {
+ return nil
+ }
+
+ var envelope struct {
+ Caption *struct {
+ Text string `json:"text"`
+ Confidence float32 `json:"confidence"`
+ } `json:"caption"`
+ Labels []LabelResult `json:"labels"`
+ }
+
+ if err := json.Unmarshal(payload, &envelope); err != nil {
+ return err
+ }
+
+ if envelope.Caption != nil {
+ text := strings.TrimSpace(envelope.Caption.Text)
+ if text != "" {
+ result.Caption = &CaptionResult{
+ Text: text,
+ Confidence: envelope.Caption.Confidence,
+ Source: entity.SrcOpenAI,
+ }
+ }
+ }
+
+ if len(envelope.Labels) > 0 {
+ filtered := envelope.Labels[:0]
+
+ for i := range envelope.Labels {
+ if envelope.Labels[i].Source == "" {
+ envelope.Labels[i].Source = entity.SrcOpenAI
+ }
+
+ normalizeLabelResult(&envelope.Labels[i])
+
+ if envelope.Labels[i].Name == "" {
+ continue
+ }
+
+ filtered = append(filtered, envelope.Labels[i])
+ }
+
+ result.Labels = append(result.Labels, filtered...)
+ }
+
+ return nil
+}
diff --git a/internal/ai/vision/engine_openai_test.go b/internal/ai/vision/engine_openai_test.go
new file mode 100644
index 000000000..6fa163b3a
--- /dev/null
+++ b/internal/ai/vision/engine_openai_test.go
@@ -0,0 +1,337 @@
+package vision
+
+import (
+ "context"
+ "encoding/json"
+ "net/http"
+ "net/http/httptest"
+ "testing"
+
+ "github.com/stretchr/testify/assert"
+ "github.com/stretchr/testify/require"
+
+ "github.com/photoprism/photoprism/internal/ai/vision/openai"
+ "github.com/photoprism/photoprism/internal/ai/vision/schema"
+ "github.com/photoprism/photoprism/internal/entity"
+)
+
+func TestOpenAIBuilderBuild(t *testing.T) {
+ model := &Model{
+ Type: ModelTypeLabels,
+ Name: openai.DefaultModel,
+ Engine: openai.EngineName,
+ }
+ model.ApplyEngineDefaults()
+
+ request, err := openaiBuilder{}.Build(context.Background(), model, Files{examplesPath + "/chameleon_lime.jpg"})
+ require.NoError(t, err)
+ require.NotNil(t, request)
+
+ assert.Equal(t, ApiFormatOpenAI, request.ResponseFormat)
+ assert.NotEmpty(t, request.Images)
+ assert.NotNil(t, request.Options)
+ assert.Equal(t, openai.DefaultDetail, request.Options.Detail)
+ assert.True(t, request.Options.ForceJson)
+ assert.GreaterOrEqual(t, request.Options.MaxOutputTokens, openai.LabelsMaxTokens)
+}
+
+func TestOpenAIBuilderBuildCaptionDisablesForceJSON(t *testing.T) {
+ model := &Model{
+ Type: ModelTypeCaption,
+ Name: openai.DefaultModel,
+ Engine: openai.EngineName,
+ Options: &ApiRequestOptions{ForceJson: true},
+ }
+ model.ApplyEngineDefaults()
+
+ request, err := openaiBuilder{}.Build(context.Background(), model, Files{examplesPath + "/chameleon_lime.jpg"})
+ require.NoError(t, err)
+ require.NotNil(t, request)
+ require.NotNil(t, request.Options)
+ assert.False(t, request.Options.ForceJson)
+ assert.GreaterOrEqual(t, request.Options.MaxOutputTokens, openai.CaptionMaxTokens)
+}
+
+func TestApiRequestJSONForOpenAI(t *testing.T) {
+ req := &ApiRequest{
+ Model: "gpt-5-mini",
+ System: "system",
+ Prompt: "describe the scene",
+ Images: []string{"data:image/jpeg;base64,AA=="},
+ ResponseFormat: ApiFormatOpenAI,
+ Options: &ApiRequestOptions{
+ Detail: openai.DefaultDetail,
+ MaxOutputTokens: 128,
+ Temperature: 0.2,
+ TopP: 0.8,
+ ForceJson: true,
+ },
+ Schema: json.RawMessage(`{"type":"object","properties":{"caption":{"type":"object"}}}`),
+ }
+
+ payload, err := req.JSON()
+ require.NoError(t, err)
+
+ var decoded struct {
+ Model string `json:"model"`
+ Input []struct {
+ Role string `json:"role"`
+ Content []struct {
+ Type string `json:"type"`
+ } `json:"content"`
+ } `json:"input"`
+ Text struct {
+ Format struct {
+ Type string `json:"type"`
+ Name string `json:"name"`
+ Schema json.RawMessage `json:"schema"`
+ Strict bool `json:"strict"`
+ } `json:"format"`
+ } `json:"text"`
+ Reasoning struct {
+ Effort string `json:"effort"`
+ } `json:"reasoning"`
+ MaxOutputTokens int `json:"max_output_tokens"`
+ }
+
+ require.NoError(t, json.Unmarshal(payload, &decoded))
+ assert.Equal(t, "gpt-5-mini", decoded.Model)
+ require.Len(t, decoded.Input, 2)
+ assert.Equal(t, "system", decoded.Input[0].Role)
+ assert.Equal(t, openai.ResponseFormatJSONSchema, decoded.Text.Format.Type)
+ assert.Equal(t, schema.JsonSchemaName(decoded.Text.Format.Schema, openai.DefaultSchemaVersion), decoded.Text.Format.Name)
+ assert.False(t, decoded.Text.Format.Strict)
+ assert.NotNil(t, decoded.Text.Format.Schema)
+ assert.Equal(t, "low", decoded.Reasoning.Effort)
+ assert.Equal(t, 128, decoded.MaxOutputTokens)
+}
+
+func TestApiRequestJSONForOpenAIDefaultSchemaName(t *testing.T) {
+ req := &ApiRequest{
+ Model: "gpt-5-mini",
+ Images: []string{"data:image/jpeg;base64,AA=="},
+ ResponseFormat: ApiFormatOpenAI,
+ Options: &ApiRequestOptions{
+ Detail: openai.DefaultDetail,
+ MaxOutputTokens: 64,
+ ForceJson: true,
+ },
+ Schema: json.RawMessage(`{"type":"object"}`),
+ }
+
+ payload, err := req.JSON()
+ require.NoError(t, err)
+
+ var decoded struct {
+ Text struct {
+ Format struct {
+ Name string `json:"name"`
+ } `json:"format"`
+ } `json:"text"`
+ }
+
+ require.NoError(t, json.Unmarshal(payload, &decoded))
+ assert.Equal(t, schema.JsonSchemaName(req.Schema, openai.DefaultSchemaVersion), decoded.Text.Format.Name)
+}
+
+func TestOpenAIParserParsesJSONFromTextPayload(t *testing.T) {
+ respPayload := `{
+ "id": "resp_123",
+ "model": "gpt-5-mini",
+ "output": [{
+ "role": "assistant",
+ "content": [{
+ "type": "output_text",
+ "text": "{\"labels\":[{\"name\":\"deer\",\"confidence\":0.98,\"topicality\":0.99}]}"
+ }]
+ }]
+ }`
+
+ req := &ApiRequest{
+ Id: "test",
+ Model: "gpt-5-mini",
+ ResponseFormat: ApiFormatOpenAI,
+ }
+
+ resp, err := openaiParser{}.Parse(context.Background(), req, []byte(respPayload), http.StatusOK)
+ require.NoError(t, err)
+ require.NotNil(t, resp)
+ require.Len(t, resp.Result.Labels, 1)
+ assert.Equal(t, "Deer", resp.Result.Labels[0].Name)
+ assert.Nil(t, resp.Result.Caption)
+}
+
+func TestParseOpenAISchemaLegacyUpgrade(t *testing.T) {
+ legacy := `{
+ "labels": [{
+ "name": "",
+ "confidence": 0,
+ "topicality": 0
+ }]
+ }`
+
+ raw, err := parseOpenAISchema(legacy)
+ require.NoError(t, err)
+
+ var decoded map[string]any
+ require.NoError(t, json.Unmarshal(raw, &decoded))
+
+ assert.Equal(t, "object", decoded["type"])
+
+ props, ok := decoded["properties"].(map[string]any)
+ require.True(t, ok)
+ labels, ok := props["labels"].(map[string]any)
+ require.True(t, ok)
+ assert.Equal(t, "array", labels["type"])
+}
+
+func TestParseOpenAISchemaLegacyUpgradeNSFW(t *testing.T) {
+ legacy := `{
+ "labels": [{
+ "name": "",
+ "confidence": 0,
+ "topicality": 0,
+ "nsfw": false,
+ "nsfw_confidence": 0
+ }]
+ }`
+
+ raw, err := parseOpenAISchema(legacy)
+ require.NoError(t, err)
+
+ var decoded map[string]any
+ require.NoError(t, json.Unmarshal(raw, &decoded))
+
+ props := decoded["properties"].(map[string]any)
+ labels := props["labels"].(map[string]any)
+ items := labels["items"].(map[string]any)
+ _, hasNSFW := items["properties"].(map[string]any)["nsfw"]
+ assert.True(t, hasNSFW)
+}
+
+func TestPerformApiRequestOpenAISuccess(t *testing.T) {
+ server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+ var reqPayload struct {
+ Model string `json:"model"`
+ }
+ assert.NoError(t, json.NewDecoder(r.Body).Decode(&reqPayload))
+ assert.Equal(t, "gpt-5-mini", reqPayload.Model)
+
+ response := map[string]any{
+ "id": "resp_123",
+ "model": "gpt-5-mini",
+ "output": []any{
+ map[string]any{
+ "role": "assistant",
+ "content": []any{
+ map[string]any{
+ "type": "output_json",
+ "json": map[string]any{
+ "caption": map[string]any{
+ "text": "A cat rests on a windowsill.",
+ "confidence": 0.91,
+ },
+ "labels": []map[string]any{
+ {
+ "name": "cat",
+ "confidence": 0.92,
+ "topicality": 0.88,
+ },
+ },
+ },
+ },
+ },
+ },
+ },
+ }
+
+ assert.NoError(t, json.NewEncoder(w).Encode(response))
+ }))
+ defer server.Close()
+
+ req := &ApiRequest{
+ Id: "test",
+ Model: "gpt-5-mini",
+ Images: []string{"data:image/jpeg;base64,AA=="},
+ ResponseFormat: ApiFormatOpenAI,
+ Options: &ApiRequestOptions{
+ Detail: openai.DefaultDetail,
+ },
+ Schema: json.RawMessage(`{"type":"object"}`),
+ }
+
+ resp, err := PerformApiRequest(req, server.URL, http.MethodPost, "secret")
+ require.NoError(t, err)
+ require.NotNil(t, resp)
+
+ require.NotNil(t, resp.Result.Caption)
+ assert.Equal(t, entity.SrcOpenAI, resp.Result.Caption.Source)
+ assert.Equal(t, "A cat rests on a windowsill.", resp.Result.Caption.Text)
+
+ require.Len(t, resp.Result.Labels, 1)
+ assert.Equal(t, entity.SrcOpenAI, resp.Result.Labels[0].Source)
+ assert.Equal(t, "Cat", resp.Result.Labels[0].Name)
+}
+
+func TestPerformApiRequestOpenAITextFallback(t *testing.T) {
+ server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+ response := map[string]any{
+ "id": "resp_456",
+ "model": "gpt-5-mini",
+ "output": []any{
+ map[string]any{
+ "role": "assistant",
+ "content": []any{
+ map[string]any{
+ "type": "output_text",
+ "text": "Two hikers reach the summit at sunset.",
+ },
+ },
+ },
+ },
+ }
+ assert.NoError(t, json.NewEncoder(w).Encode(response))
+ }))
+ defer server.Close()
+
+ req := &ApiRequest{
+ Id: "fallback",
+ Model: "gpt-5-mini",
+ Images: []string{"data:image/jpeg;base64,AA=="},
+ ResponseFormat: ApiFormatOpenAI,
+ Options: &ApiRequestOptions{
+ Detail: openai.DefaultDetail,
+ },
+ Schema: nil,
+ }
+
+ resp, err := PerformApiRequest(req, server.URL, http.MethodPost, "")
+ require.NoError(t, err)
+ require.NotNil(t, resp.Result.Caption)
+ assert.Equal(t, "Two hikers reach the summit at sunset.", resp.Result.Caption.Text)
+ assert.Equal(t, entity.SrcOpenAI, resp.Result.Caption.Source)
+}
+
+func TestPerformApiRequestOpenAIError(t *testing.T) {
+ server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+ w.WriteHeader(http.StatusBadRequest)
+ _ = json.NewEncoder(w).Encode(map[string]any{
+ "error": map[string]any{
+ "message": "Invalid image payload",
+ },
+ })
+ }))
+ defer server.Close()
+
+ req := &ApiRequest{
+ Id: "error",
+ Model: "gpt-5-mini",
+ ResponseFormat: ApiFormatOpenAI,
+ Schema: nil,
+ Images: []string{"data:image/jpeg;base64,AA=="},
+ }
+
+ _, err := PerformApiRequest(req, server.URL, http.MethodPost, "")
+ require.Error(t, err)
+ assert.Contains(t, err.Error(), "Invalid image payload")
+}
diff --git a/internal/ai/vision/labels.go b/internal/ai/vision/labels.go
index f80f90da8..a148bbfa1 100644
--- a/internal/ai/vision/labels.go
+++ b/internal/ai/vision/labels.go
@@ -96,8 +96,10 @@ func labelsInternal(images Files, mediaSrc media.Src, labelSrc entity.Src) (resu
apiRequest.Prompt = prompt
}
- if options := model.GetOptions(); options != nil {
- apiRequest.Options = options
+ if apiRequest.Options == nil {
+ if options := model.GetOptions(); options != nil {
+ apiRequest.Options = options
+ }
}
apiRequest.WriteLog()
diff --git a/internal/ai/vision/model.go b/internal/ai/vision/model.go
index 8054eb1cb..db2f3a275 100644
--- a/internal/ai/vision/model.go
+++ b/internal/ai/vision/model.go
@@ -348,6 +348,26 @@ func mergeOptionDefaults(target, defaults *ApiRequestOptions) {
if len(target.Stop) == 0 && len(defaults.Stop) > 0 {
target.Stop = append([]string(nil), defaults.Stop...)
}
+
+ if target.MaxOutputTokens <= 0 && defaults.MaxOutputTokens > 0 {
+ target.MaxOutputTokens = defaults.MaxOutputTokens
+ }
+
+ if strings.TrimSpace(target.Detail) == "" && strings.TrimSpace(defaults.Detail) != "" {
+ target.Detail = strings.TrimSpace(defaults.Detail)
+ }
+
+ if !target.ForceJson && defaults.ForceJson {
+ target.ForceJson = true
+ }
+
+ if target.SchemaVersion == "" && defaults.SchemaVersion != "" {
+ target.SchemaVersion = defaults.SchemaVersion
+ }
+
+ if target.CombineOutputs == "" && defaults.CombineOutputs != "" {
+ target.CombineOutputs = defaults.CombineOutputs
+ }
}
func normalizeOptions(opts *ApiRequestOptions) {
@@ -422,6 +442,10 @@ func (m *Model) ApplyEngineDefaults() {
}
if info, ok := EngineInfoFor(engine); ok {
+ if m.Service.Uri == "" {
+ m.Service.Uri = info.Uri
+ }
+
if m.Service.RequestFormat == "" {
m.Service.RequestFormat = info.RequestFormat
}
@@ -490,7 +514,7 @@ func (m *Model) SchemaTemplate() string {
}
if m.schema == "" {
- m.schema = visionschema.Labels(m.PromptContains("nsfw"))
+ m.schema = visionschema.LabelsJson(m.PromptContains("nsfw"))
}
}
})
diff --git a/internal/ai/vision/ollama/defaults.go b/internal/ai/vision/ollama/defaults.go
index 145e710eb..64530def9 100644
--- a/internal/ai/vision/ollama/defaults.go
+++ b/internal/ai/vision/ollama/defaults.go
@@ -1,7 +1,5 @@
package ollama
-import "github.com/photoprism/photoprism/internal/ai/vision/schema"
-
const (
// CaptionPrompt instructs Ollama caption models to emit a single, active-voice sentence.
CaptionPrompt = "Create a caption with exactly one sentence in the active voice that describes the main visual content. Begin with the main subject and clear action. Avoid text formatting, meta-language, and filler words."
@@ -22,12 +20,3 @@ const (
// DefaultResolution is the default thumbnail size submitted to Ollama models.
DefaultResolution = 720
)
-
-// LabelsSchema returns the canonical label schema string consumed by Ollama models.
-func LabelsSchema(nsfw bool) string {
- if nsfw {
- return schema.LabelsNSFW
- } else {
- return schema.LabelsDefault
- }
-}
diff --git a/internal/ai/vision/ollama/schema.go b/internal/ai/vision/ollama/schema.go
new file mode 100644
index 000000000..1bbcd857b
--- /dev/null
+++ b/internal/ai/vision/ollama/schema.go
@@ -0,0 +1,14 @@
+package ollama
+
+import (
+ "github.com/photoprism/photoprism/internal/ai/vision/schema"
+)
+
+// SchemaLabels returns the canonical label schema string consumed by Ollama models.
+//
+// Related documentation and references:
+// - https://www.alibabacloud.com/help/en/model-studio/json-mode
+// - https://www.json.org/json-en.html
+func SchemaLabels(nsfw bool) string {
+ return schema.LabelsJson(nsfw)
+}
diff --git a/internal/ai/vision/ollama/transport.go b/internal/ai/vision/ollama/transport.go
new file mode 100644
index 000000000..bf0be34ab
--- /dev/null
+++ b/internal/ai/vision/ollama/transport.go
@@ -0,0 +1,79 @@
+package ollama
+
+import (
+ "errors"
+ "fmt"
+ "time"
+)
+
+// Response encapsulates the subset of the Ollama generate API response we care about.
+type Response struct {
+ ID string `yaml:"Id,omitempty" json:"id,omitempty"`
+ Code int `yaml:"Code,omitempty" json:"code,omitempty"`
+ Error string `yaml:"Error,omitempty" json:"error,omitempty"`
+ Model string `yaml:"Model,omitempty" json:"model,omitempty"`
+ CreatedAt time.Time `yaml:"CreatedAt,omitempty" json:"created_at,omitempty"`
+ Response string `yaml:"Response,omitempty" json:"response,omitempty"`
+ Done bool `yaml:"Done,omitempty" json:"done,omitempty"`
+ Context []int `yaml:"Context,omitempty" json:"context,omitempty"`
+ TotalDuration int64 `yaml:"TotalDuration,omitempty" json:"total_duration,omitempty"`
+ LoadDuration int `yaml:"LoadDuration,omitempty" json:"load_duration,omitempty"`
+ PromptEvalCount int `yaml:"PromptEvalCount,omitempty" json:"prompt_eval_count,omitempty"`
+ PromptEvalDuration int `yaml:"PromptEvalDuration,omitempty" json:"prompt_eval_duration,omitempty"`
+ EvalCount int `yaml:"EvalCount,omitempty" json:"eval_count,omitempty"`
+ EvalDuration int64 `yaml:"EvalDuration,omitempty" json:"eval_duration,omitempty"`
+ Result ResultPayload `yaml:"Result,omitempty" json:"result,omitempty"`
+}
+
+// Err returns an error if the request has failed.
+func (r *Response) Err() error {
+ if r == nil {
+ return errors.New("response is nil")
+ }
+
+ if r.Code >= 400 {
+ if r.Error != "" {
+ return errors.New(r.Error)
+ }
+
+ return fmt.Errorf("error %d", r.Code)
+ } else if len(r.Result.Labels) == 0 && r.Result.Caption == nil {
+ return errors.New("no result")
+ }
+
+ return nil
+}
+
+// HasResult checks if there is at least one result in the response data.
+func (r *Response) HasResult() bool {
+ if r == nil {
+ return false
+ }
+
+ return len(r.Result.Labels) > 0 || r.Result.Caption != nil
+}
+
+// ResultPayload mirrors the structure returned by Ollama for result data.
+type ResultPayload struct {
+ Labels []LabelPayload `json:"labels"`
+ Caption *CaptionPayload `json:"caption,omitempty"`
+}
+
+// LabelPayload represents a single label object emitted by the Ollama adapter.
+type LabelPayload struct {
+ Name string `json:"name"`
+ Source string `json:"source,omitempty"`
+ Priority int `json:"priority,omitempty"`
+ Confidence float32 `json:"confidence,omitempty"`
+ Topicality float32 `json:"topicality,omitempty"`
+ Categories []string `json:"categories,omitempty"`
+ NSFW bool `json:"nsfw,omitempty"`
+ NSFWConfidence float32 `json:"nsfw_confidence,omitempty"`
+}
+
+// CaptionPayload represents the caption object emitted by the Ollama adapter.
+type CaptionPayload struct {
+ Text string `json:"text"`
+ Source string `json:"source,omitempty"`
+ Confidence float32 `json:"confidence,omitempty"`
+}
diff --git a/internal/ai/vision/openai/README.md b/internal/ai/vision/openai/README.md
new file mode 100644
index 000000000..f6d75bc38
--- /dev/null
+++ b/internal/ai/vision/openai/README.md
@@ -0,0 +1,128 @@
+## PhotoPrism — OpenAI API Integration
+
+**Last Updated:** November 14, 2025
+
+### Overview
+
+This package contains PhotoPrism’s adapter for the OpenAI Responses API. It enables existing caption and label workflows (`GenerateCaption`, `GenerateLabels`, and the `photoprism vision run` CLI) to call OpenAI models alongside TensorFlow and Ollama without changing worker or API code. The implementation focuses on predictable results, structured outputs, and clear observability so operators can opt in gradually.
+
+#### Context & Constraints
+
+- OpenAI requests flow through the existing vision client (`internal/ai/vision/api_client.go`) and must honour PhotoPrism’s timeout, logging, and ACL rules.
+- Structured outputs are preferred, but the adapter must gracefully handle free-form text; `output_text` responses are parsed as JSON when possible and fall back to plain captions otherwise.
+- Costs should remain predictable: requests are limited to a single 720 px thumbnail (`detail=low`) with capped token budgets (512 caption tokens, 1024 label tokens).
+- Secrets are supplied per model (`Service.Key`) with fallbacks to `OPENAI_API_KEY` / `_FILE`. Logs must redact sensitive data.
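+
+A minimal sketch of the documented key-resolution order; `resolveApiKey` is a hypothetical helper for illustration, not the adapter's actual implementation:
+
+```go
+package example
+
+import (
+	"errors"
+	"os"
+	"strings"
+)
+
+// resolveApiKey illustrates the documented fallback order:
+// Service.Key first, then OPENAI_API_KEY, then OPENAI_API_KEY_FILE.
+func resolveApiKey(serviceKey string) (string, error) {
+	if key := strings.TrimSpace(serviceKey); key != "" {
+		return key, nil
+	}
+	if key := strings.TrimSpace(os.Getenv("OPENAI_API_KEY")); key != "" {
+		return key, nil
+	}
+	if file := os.Getenv("OPENAI_API_KEY_FILE"); file != "" {
+		data, err := os.ReadFile(file)
+		if err != nil {
+			return "", err
+		}
+		return strings.TrimSpace(string(data)), nil
+	}
+	return "", errors.New("no OpenAI API key configured")
+}
+```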
+
+#### Goals
+
+- Provide drop-in OpenAI support for captions and labels using `vision.yml`.
+- Keep configuration ergonomic by auto-populating prompts, schema names, token limits, and sampling defaults.
+- Expose enough logging and tests so operators can compare OpenAI output with existing engines before enabling it broadly.
+
+#### Non-Goals
+
+- Introducing a new `generate` model type or combined caption/label endpoint (reserved for a later phase).
+- Replacing the default TensorFlow models; they remain active as fallbacks.
+- Managing OpenAI billing or quota dashboards beyond surfacing token counts in logs and metrics.
+
+### Prompt, Model, & Schema Guidance
+
+- **Models:** The adapter targets GPT‑5 vision tiers (e.g. `gpt-5-nano`, `gpt-5-mini`). These models support image inputs, structured outputs, and deterministic settings. Set `Name` to the exact provider identifier so defaults are applied correctly. Caption models share the same configuration surface and run through the same adapter.
+- **Prompts:** Defaults live in `defaults.go`. Captions use a single-sentence instruction; labels use `LabelPromptDefault` (or `LabelPromptNSFW` when PhotoPrism requests NSFW metadata). Custom prompts should retain schema reminders so structured outputs stay valid.
+- **Schemas:** Labels use the JSON schema returned by `schema.LabelsJsonSchema(nsfw)`; the response format name is derived via `schema.JsonSchemaName` (e.g. `photoprism_vision_labels_v1`). Captions omit schemas unless operators explicitly request a structured format.
+- **When to keep defaults:** For most deployments, leaving `System`, `Prompt`, `Schema`, and `Options` unset yields stable output with minimal configuration. Override them only when domain-specific language or custom scoring is necessary, and add regression tests alongside.
+
+Budget-conscious operators can experiment with lighter prompts or lower-resolution thumbnails, but should keep token limits and determinism settings intact to avoid unexpected bills and UI churn.
+
+#### Performance & Cost Estimates
+
+- **Token budgets:** Captions request up to 512 output tokens; labels request up to 1024. Input tokens are typically ≤700 for a single 720 px thumbnail plus prompts.
+- **Latency:** GPT‑5 nano/mini vision calls typically complete in 3–8 s, depending on OpenAI region. Including reasoning metadata (`reasoning.effort=low`) has negligible impact but improves traceability.
+- **Costs:** Consult OpenAI’s pricing for the selected model. Multiply input/output tokens by the published rate. PhotoPrism currently sends one image per request to keep costs linear with photo count.
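+
+The cost arithmetic can be made concrete with a small helper. Rates are deliberately parameters rather than constants, since published prices vary by model and change over time; the only concrete numbers below come from the bundled `labels-response.json` fixture:
+
+```go
+// EstimateUSD multiplies token counts by operator-supplied per-token rates.
+// No prices are hard-coded; consult OpenAI's pricing page for current rates.
+func EstimateUSD(inputTokens, outputTokens int, inRate, outRate float64) float64 {
+	return float64(inputTokens)*inRate + float64(outputTokens)*outRate
+}
+
+// Example: the labels fixture reports 724 input and 169 output tokens,
+// so one such request costs EstimateUSD(724, 169, inRate, outRate).
+```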
+
+#### Defaults
+
+- File scheme: `data:` URLs (base64) for all OpenAI models.
+- Resolution: 720 px thumbnails (`vision.Thumb(ModelTypeCaption|Labels)`).
+- Options: `MaxOutputTokens` raised to 512 (caption) / 1024 (labels); `ForceJson=false` for captions, `true` for labels; `reasoning.effort="low"`.
+- Sampling: `Temperature` and `TopP` set to `0` for `gpt-5*` models; inherited values (0.1/0.9) remain for other engines. `openaiBuilder.Build` performs this override while preserving the struct defaults for non-OpenAI adapters.
+- Schema naming: Automatically derived via `schema.JsonSchemaName`, so operators may omit `SchemaVersion`.
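+
+A short sketch of the schema-name derivation, mirroring the assertions in `engine_openai_test.go`:
+
+```go
+package example
+
+import (
+	"fmt"
+
+	"github.com/photoprism/photoprism/internal/ai/vision/openai"
+	"github.com/photoprism/photoprism/internal/ai/vision/schema"
+)
+
+// PrintSchemaName derives the response_format name from the schema content
+// plus the default version suffix, as the adapter does when SchemaVersion
+// is left unset in vision.yml.
+func PrintSchemaName() {
+	raw := openai.SchemaLabels(false) // canonical labels JSON schema
+	fmt.Println(schema.JsonSchemaName(raw, openai.DefaultSchemaVersion))
+	// Per the bundled fixture: photoprism_vision_labels_v1
+}
+```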
+
+### Configuration
+
+#### Environment Variables
+
+- `OPENAI_API_KEY` / `OPENAI_API_KEY_FILE` — fallback credentials when a model’s `Service.Key` is unset.
+- Existing `PHOTOPRISM_VISION_*` variables remain authoritative (see the [Developer Guide](https://docs.photoprism.app/developer-guide/vision/service/) for full lists).
+
+#### `vision.yml` Examples
+
+```yaml
+Models:
+ - Type: caption
+ Name: gpt-5-nano
+ Engine: openai
+ Disabled: false # opt in manually
+ Resolution: 720 # optional; default is 720
+ Options:
+ Detail: low # optional; defaults to low
+ MaxOutputTokens: 512
+ Service:
+ Uri: https://api.openai.com/v1/responses
+ FileScheme: data
+ Key: ${OPENAI_API_KEY}
+
+ - Type: labels
+ Name: gpt-5-mini
+ Engine: openai
+ Disabled: false
+ Resolution: 720
+ Options:
+ Detail: low
+ MaxOutputTokens: 1024
+ ForceJson: true # redundant but explicit
+ Service:
+ Uri: https://api.openai.com/v1/responses
+ FileScheme: data
+ Key: ${OPENAI_API_KEY}
+```
+
+Keep TensorFlow entries in place so PhotoPrism falls back when the external service is unavailable.
+
+### Documentation
+
+- Label Generation:
+- Caption Generation:
+- Vision CLI Commands:
+
+### Implementation Details
+
+#### Core Concepts
+
+- **Structured outputs:** PhotoPrism leverages OpenAI’s structured output capability as documented at https://platform.openai.com/docs/guides/structured-outputs. When a JSON schema is supplied, the adapter emits `text.format` with `type: "json_schema"` and a schema name derived from the content. The parser prefers `output_json`, but also attempts to decode `output_text` payloads that contain JSON objects; a condensed payload sketch follows this list.
+- **Deterministic sampling:** GPT‑5 models are run with `temperature=0` and `top_p=0` to minimise variance, while still allowing developers to override values in `vision.yml` if needed.
+- **Reasoning metadata:** Requests include `reasoning.effort="low"` so OpenAI returns structured reasoning usage counters, helping operators track token consumption.
+- **Worker summaries:** The vision worker now logs either “updated …” or “processed … (no metadata changes detected)”, making reruns easy to audit.
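+
+For reference, a condensed sketch of the labels payload that `openAIJSON` assembles from the types in `transport.go` (the base64 image is a placeholder taken from the tests):
+
+```go
+// Assumes "encoding/json" and the openai package from this change.
+func buildLabelsPayload() ([]byte, error) {
+	payload := openai.HTTPRequest{
+		Model: openai.DefaultModel,
+		Input: []openai.InputMessage{{
+			Role: "user",
+			Type: "message",
+			Content: []openai.ContentItem{
+				{Type: openai.ContentTypeText, Text: openai.LabelPromptDefault},
+				{Type: openai.ContentTypeImage, ImageURL: "data:image/jpeg;base64,AA==", Detail: openai.DefaultDetail},
+			},
+		}},
+		Text: &openai.TextOptions{
+			Format: &openai.ResponseFormat{
+				Type:   openai.ResponseFormatJSONSchema,
+				Name:   "photoprism_vision_labels_v1",
+				Schema: openai.SchemaLabels(false),
+			},
+		},
+		Reasoning:       &openai.Reasoning{Effort: "low"},
+		MaxOutputTokens: openai.LabelsMaxTokens,
+	}
+
+	return json.Marshal(payload)
+}
+```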
+
+#### Rate Limiting
+
+OpenAI calls respect the existing `limiter.Auth` configuration used by the vision service. Failed requests surface standard HTTP errors and are not automatically retried; operators should ensure they have adequate account limits and consider external rate limiting when sharing credentials.
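+
+Operators who share credentials across instances can add a client-side throttle; this is an illustrative sketch using `golang.org/x/time/rate`, not something PhotoPrism ships:
+
+```go
+package example
+
+import (
+	"context"
+
+	"golang.org/x/time/rate"
+)
+
+// openaiLimiter allows roughly two requests per second with no bursting;
+// tune the values to match your account limits.
+var openaiLimiter = rate.NewLimiter(rate.Limit(2), 1)
+
+// throttledRequest blocks until the limiter grants a slot, then runs call.
+func throttledRequest(ctx context.Context, call func() error) error {
+	if err := openaiLimiter.Wait(ctx); err != nil {
+		return err
+	}
+	return call()
+}
+```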
+
+#### Testing & Validation
+
+1. Unit tests: `go test ./internal/ai/vision/openai ./internal/ai/vision -run OpenAI -count=1`. Fixtures under `internal/ai/vision/openai/testdata/` replay real Responses payloads (captions and labels).
+2. CLI smoke test: `photoprism vision run -m labels --count 1 --force --model=gpt-5-mini` with trace logging enabled to inspect sanitised Responses.
+3. Compare worker summaries and label sources (`openai`) in the UI or via `photoprism vision ls`.
+
+#### Code Map
+
+- **Adapter & defaults:** `internal/ai/vision/openai` (defaults, schema helpers, transport, tests).
+- **Request/response plumbing:** `internal/ai/vision/api_request.go`, `api_client.go`, `engine_openai.go`, `engine_openai_test.go`.
+- **Workers & CLI:** `internal/workers/vision.go`, `internal/commands/vision_run.go`.
+- **Shared utilities:** `internal/ai/vision/schema`, `pkg/clean`, `pkg/media`.
+
+#### Next Steps
+
+- [ ] Introduce the future `generate` model type that combines captions, labels, and optional markers.
+- [ ] Evaluate additional OpenAI models as pricing and capabilities evolve.
+- [ ] Expose token usage metrics (input/output/reasoning) via Prometheus once the schema stabilises.
diff --git a/internal/ai/vision/openai/defaults.go b/internal/ai/vision/openai/defaults.go
index b29b44bea..36f9977dd 100644
--- a/internal/ai/vision/openai/defaults.go
+++ b/internal/ai/vision/openai/defaults.go
@@ -1,6 +1,29 @@
package openai
-import "github.com/photoprism/photoprism/internal/ai/vision/schema"
+const (
+ // CaptionSystem defines the default system prompt for caption models.
+ CaptionSystem = "You are a PhotoPrism vision model. Return concise, user-friendly captions that describe the main subjects accurately."
+ // CaptionPrompt instructs caption models to respond with a single sentence.
+ CaptionPrompt = "Provide exactly one sentence describing the key subject and action in the image. Avoid filler words and technical jargon."
+ // LabelSystem defines the system prompt for label generation.
+ LabelSystem = "You are a PhotoPrism vision model. Emit JSON that matches the provided schema and keep label names short, singular nouns."
+ // LabelPromptDefault requests general-purpose labels.
+ LabelPromptDefault = "Analyze the image and return label objects with name, confidence (0-1), and topicality (0-1)."
+ // LabelPromptNSFW requests labels including NSFW metadata when required.
+ LabelPromptNSFW = "Analyze the image and return label objects with name, confidence (0-1), topicality (0-1), nsfw (true when sensitive), and nsfw_confidence (0-1)."
+ // DefaultDetail specifies the preferred thumbnail detail level for Responses API calls.
+ DefaultDetail = "low"
+ // CaptionMaxTokens suggests the output budget for caption responses.
+ CaptionMaxTokens = 512
+ // LabelsMaxTokens suggests the output budget for label responses.
+ LabelsMaxTokens = 1024
+ // DefaultTemperature configures deterministic replies.
+ DefaultTemperature = 0.1
+ // DefaultTopP limits nucleus sampling.
+ DefaultTopP = 0.9
+ // DefaultSchemaVersion is used when callers do not specify an explicit schema version.
+ DefaultSchemaVersion = "v1"
+)
var (
// DefaultModel is the model used by default when accessing the OpenAI API.
@@ -8,8 +31,3 @@ var (
// DefaultResolution is the default thumbnail size submitted to the OpenAI.
DefaultResolution = 720
)
-
-// LabelsSchema returns the canonical label schema string consumed by OpenAI models.
-func LabelsSchema() string {
- return schema.LabelsDefault
-}
diff --git a/internal/ai/vision/openai/schema.go b/internal/ai/vision/openai/schema.go
new file mode 100644
index 000000000..0d37aeb84
--- /dev/null
+++ b/internal/ai/vision/openai/schema.go
@@ -0,0 +1,16 @@
+package openai
+
+import (
+ "encoding/json"
+
+ "github.com/photoprism/photoprism/internal/ai/vision/schema"
+)
+
+// SchemaLabels returns the canonical labels JSON Schema consumed by OpenAI models.
+//
+// Related documentation and references:
+// - https://platform.openai.com/docs/guides/structured-outputs
+// - https://json-schema.org/learn/miscellaneous-examples
+func SchemaLabels(nsfw bool) json.RawMessage {
+ return schema.LabelsJsonSchema(nsfw)
+}
diff --git a/internal/ai/vision/openai/testdata/caption-response.json b/internal/ai/vision/openai/testdata/caption-response.json
new file mode 100644
index 000000000..e77eac59c
--- /dev/null
+++ b/internal/ai/vision/openai/testdata/caption-response.json
@@ -0,0 +1,73 @@
+{
+ "id": "resp_0d356718505119f3006916e5d8730881a0b91de2aa700f6196",
+ "object": "response",
+ "created_at": 1763108312,
+ "status": "completed",
+ "background": false,
+ "billing": {
+ "payer": "developer"
+ },
+ "error": null,
+ "incomplete_details": null,
+ "instructions": null,
+ "max_output_tokens": 512,
+ "max_tool_calls": null,
+ "model": "gpt-5-nano-2025-08-07",
+ "output": [
+ {
+ "id": "rs_0d356718505119f3006916e5d8efd481a0a4f9cc1823cc6c83",
+ "type": "reasoning",
+ "summary": []
+ },
+ {
+ "id": "msg_0d356718505119f3006916e5d9433881a0bc79197d2cfc2027",
+ "type": "message",
+ "status": "completed",
+ "content": [
+ {
+ "type": "output_text",
+ "annotations": [],
+ "logprobs": [],
+ "text": "A bee gathers nectar from the vibrant red poppy\u2019s center."
+ }
+ ],
+ "role": "assistant"
+ }
+ ],
+ "parallel_tool_calls": true,
+ "previous_response_id": null,
+ "prompt_cache_key": null,
+ "prompt_cache_retention": null,
+ "reasoning": {
+ "effort": "low",
+ "summary": null
+ },
+ "safety_identifier": null,
+ "service_tier": "default",
+ "store": true,
+ "temperature": 1.0,
+ "text": {
+ "format": {
+ "type": "text"
+ },
+ "verbosity": "medium"
+ },
+ "tool_choice": "auto",
+ "tools": [],
+ "top_logprobs": 0,
+ "top_p": 1.0,
+ "truncation": "disabled",
+ "usage": {
+ "input_tokens": 576,
+ "input_tokens_details": {
+ "cached_tokens": 0
+ },
+ "output_tokens": 19,
+ "output_tokens_details": {
+ "reasoning_tokens": 0
+ },
+ "total_tokens": 595
+ },
+ "user": null,
+ "metadata": {}
+}
diff --git a/internal/ai/vision/openai/testdata/labels-response.json b/internal/ai/vision/openai/testdata/labels-response.json
new file mode 100644
index 000000000..c1cc3deb1
--- /dev/null
+++ b/internal/ai/vision/openai/testdata/labels-response.json
@@ -0,0 +1,114 @@
+{
+ "id": "resp_0fa91dfb69b7d644006916ea0b72ac819f84ff3152a38dfcdb",
+ "object": "response",
+ "created_at": 1763109387,
+ "status": "completed",
+ "background": false,
+ "billing": {
+ "payer": "developer"
+ },
+ "error": null,
+ "incomplete_details": null,
+ "instructions": null,
+ "max_output_tokens": 1024,
+ "max_tool_calls": null,
+ "model": "gpt-5-mini-2025-08-07",
+ "output": [
+ {
+ "id": "rs_0fa91dfb69b7d644006916ea0c3450819f8a13396bf377f474",
+ "type": "reasoning",
+ "summary": []
+ },
+ {
+ "id": "msg_0fa91dfb69b7d644006916ea0d2dfc819faf52b11334fc10a4",
+ "type": "message",
+ "status": "completed",
+ "content": [
+ {
+ "type": "output_text",
+ "annotations": [],
+ "logprobs": [],
+ "text": "{\"labels\":[{\"name\":\"flower\",\"confidence\":0.99,\"topicality\":0.99},{\"name\":\"bee\",\"confidence\":0.95,\"topicality\":0.95},{\"name\":\"petal\",\"confidence\":0.92,\"topicality\":0.88},{\"name\":\"pollen\",\"confidence\":0.85,\"topicality\":0.8},{\"name\":\"insect\",\"confidence\":0.9,\"topicality\":0.85},{\"name\":\"red\",\"confidence\":0.88,\"topicality\":0.6},{\"name\":\"close-up\",\"confidence\":0.86,\"topicality\":0.7},{\"name\":\"nature\",\"confidence\":0.8,\"topicality\":0.5}]}"
+ }
+ ],
+ "role": "assistant"
+ }
+ ],
+ "parallel_tool_calls": true,
+ "previous_response_id": null,
+ "prompt_cache_key": null,
+ "prompt_cache_retention": null,
+ "reasoning": {
+ "effort": "low",
+ "summary": null
+ },
+ "safety_identifier": null,
+ "service_tier": "default",
+ "store": true,
+ "temperature": 1.0,
+ "text": {
+ "format": {
+ "type": "json_schema",
+ "description": null,
+ "name": "photoprism_vision_labels_v1",
+ "schema": {
+ "type": "object",
+ "properties": {
+ "labels": {
+ "type": "array",
+ "items": {
+ "type": "object",
+ "properties": {
+ "name": {
+ "type": "string",
+ "minLength": 1
+ },
+ "confidence": {
+ "type": "number",
+ "minimum": 0,
+ "maximum": 1
+ },
+ "topicality": {
+ "type": "number",
+ "minimum": 0,
+ "maximum": 1
+ }
+ },
+ "required": [
+ "name",
+ "confidence",
+ "topicality"
+ ],
+ "additionalProperties": false
+ },
+ "default": []
+ }
+ },
+ "required": [
+ "labels"
+ ],
+ "additionalProperties": false
+ },
+ "strict": true
+ },
+ "verbosity": "medium"
+ },
+ "tool_choice": "auto",
+ "tools": [],
+ "top_logprobs": 0,
+ "top_p": 1.0,
+ "truncation": "disabled",
+ "usage": {
+ "input_tokens": 724,
+ "input_tokens_details": {
+ "cached_tokens": 0
+ },
+ "output_tokens": 169,
+ "output_tokens_details": {
+ "reasoning_tokens": 0
+ },
+ "total_tokens": 893
+ },
+ "user": null,
+ "metadata": {}
+}
diff --git a/internal/ai/vision/openai/transport.go b/internal/ai/vision/openai/transport.go
new file mode 100644
index 000000000..ee061e149
--- /dev/null
+++ b/internal/ai/vision/openai/transport.go
@@ -0,0 +1,142 @@
+package openai
+
+import (
+ "encoding/json"
+ "strings"
+)
+
+const (
+ // ContentTypeText identifies text input segments for the Responses API.
+ ContentTypeText = "input_text"
+ // ContentTypeImage identifies image input segments for the Responses API.
+ ContentTypeImage = "input_image"
+
+ // ResponseFormatJSONSchema requests JSON constrained by a schema.
+ ResponseFormatJSONSchema = "json_schema"
+ // ResponseFormatJSONObject requests a free-form JSON object.
+ ResponseFormatJSONObject = "json_object"
+)
+
+// HTTPRequest represents the payload expected by OpenAI's Responses API.
+type HTTPRequest struct {
+ Model string `json:"model"`
+ Input []InputMessage `json:"input"`
+ Text *TextOptions `json:"text,omitempty"`
+ Reasoning *Reasoning `json:"reasoning,omitempty"`
+ MaxOutputTokens int `json:"max_output_tokens,omitempty"`
+ Temperature float64 `json:"temperature,omitempty"`
+ TopP float64 `json:"top_p,omitempty"`
+ PresencePenalty float64 `json:"presence_penalty,omitempty"`
+ FrequencyPenalty float64 `json:"frequency_penalty,omitempty"`
+}
+
+// TextOptions carries formatting preferences for textual responses.
+type TextOptions struct {
+ Format *ResponseFormat `json:"format,omitempty"`
+}
+
+// Reasoning configures the effort level for reasoning models.
+type Reasoning struct {
+ Effort string `json:"effort,omitempty"`
+}
+
+// InputMessage captures a single system or user message in the request.
+type InputMessage struct {
+ Role string `json:"role"`
+ Type string `json:"type,omitempty"`
+ Content []ContentItem `json:"content"`
+}
+
+// ContentItem represents a text or image entry within a message.
+type ContentItem struct {
+ Type string `json:"type"`
+ Text string `json:"text,omitempty"`
+ ImageURL string `json:"image_url,omitempty"`
+ Detail string `json:"detail,omitempty"`
+}
+
+// ResponseFormat describes how OpenAI should format its response.
+type ResponseFormat struct {
+ Type string `json:"type"`
+ Name string `json:"name,omitempty"`
+ Schema json.RawMessage `json:"schema,omitempty"`
+ Description string `json:"description,omitempty"`
+ Strict bool `json:"strict,omitempty"`
+}
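+
+// A minimal sketch of how these types compose into a labels request; the
+// model name, prompt, and inline schema below are illustrative placeholders
+// rather than values fixed by this package:
+//
+//	req := HTTPRequest{
+//		Model: "gpt-5-mini",
+//		Input: []InputMessage{{
+//			Role: "user",
+//			Content: []ContentItem{
+//				{Type: ContentTypeText, Text: "Label this image."},
+//				{Type: ContentTypeImage, ImageURL: "data:image/jpeg;base64,..."},
+//			},
+//		}},
+//		Text: &TextOptions{Format: &ResponseFormat{
+//			Type:   ResponseFormatJSONSchema,
+//			Name:   "photoprism_vision_labels_v1",
+//			Schema: json.RawMessage(`{"type":"object"}`),
+//			Strict: true,
+//		}},
+//	}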
+
+// Response mirrors the subset of the Responses API response we need.
+type Response struct {
+ ID string `json:"id"`
+ Model string `json:"model"`
+ Output []ResponseOutput `json:"output"`
+ Error *struct {
+ Message string `json:"message"`
+ Type string `json:"type"`
+ } `json:"error,omitempty"`
+}
+
+// ResponseOutput captures assistant messages within the response.
+type ResponseOutput struct {
+ Role string `json:"role"`
+ Content []ResponseContent `json:"content"`
+}
+
+// ResponseContent contains individual message parts (JSON or text).
+type ResponseContent struct {
+ Type string `json:"type"`
+ Text string `json:"text,omitempty"`
+ JSON json.RawMessage `json:"json,omitempty"`
+}
+
+// FirstJSON returns the first JSON payload contained in the response.
+func (r *Response) FirstJSON() json.RawMessage {
+ if r == nil {
+ return nil
+ }
+
+ for i := range r.Output {
+ for j := range r.Output[i].Content {
+ if len(r.Output[i].Content[j].JSON) > 0 {
+ return r.Output[i].Content[j].JSON
+ }
+ }
+ }
+
+ return nil
+}
+
+// FirstText returns the first textual payload contained in the response.
+func (r *Response) FirstText() string {
+ if r == nil {
+ return ""
+ }
+
+ for i := range r.Output {
+ for j := range r.Output[i].Content {
+ if text := strings.TrimSpace(r.Output[i].Content[j].Text); text != "" {
+ return text
+ }
+ }
+ }
+
+ return ""
+}
+
+// ParseErrorMessage extracts a human-readable error message from a Responses API payload.
+func ParseErrorMessage(raw []byte) string {
+ var errResp struct {
+ Error *struct {
+ Message string `json:"message"`
+ } `json:"error"`
+ }
+
+ if err := json.Unmarshal(raw, &errResp); err != nil {
+ return ""
+ }
+
+ if errResp.Error != nil {
+ return strings.TrimSpace(errResp.Error.Message)
+ }
+
+ return ""
+}
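+
+// A minimal sketch of the intended decode path (illustrative only; rawBody,
+// handleLabels, and handleCaption are hypothetical names rather than part of
+// this package):
+//
+//	var resp Response
+//	if err := json.Unmarshal(rawBody, &resp); err != nil {
+//		return err
+//	}
+//	if resp.Error != nil {
+//		return errors.New(resp.Error.Message)
+//	}
+//	if payload := resp.FirstJSON(); len(payload) > 0 {
+//		return handleLabels(payload) // structured output_json content
+//	}
+//	return handleCaption(resp.FirstText()) // free-form output_text fallback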
diff --git a/internal/ai/vision/openai/transport_test.go b/internal/ai/vision/openai/transport_test.go
new file mode 100644
index 000000000..6141ea4f6
--- /dev/null
+++ b/internal/ai/vision/openai/transport_test.go
@@ -0,0 +1,120 @@
+package openai
+
+import (
+ "encoding/json"
+ "os"
+ "path/filepath"
+ "testing"
+)
+
+func loadTestResponse(t *testing.T, name string) *Response {
+ t.Helper()
+
+ filePath := filepath.Join("testdata", name)
+
+ data, err := os.ReadFile(filePath)
+ if err != nil {
+ t.Fatalf("failed to read %s: %v", filePath, err)
+ }
+
+ var resp Response
+ if err := json.Unmarshal(data, &resp); err != nil {
+ t.Fatalf("failed to unmarshal %s: %v", filePath, err)
+ }
+
+ return &resp
+}
+
+func TestParseErrorMessage(t *testing.T) {
+	t.Run("ReturnsMessageWhenPresent", func(t *testing.T) {
+ raw := []byte(`{"error":{"message":"Invalid schema"}}`)
+ msg := ParseErrorMessage(raw)
+ if msg != "Invalid schema" {
+ t.Fatalf("expected message, got %q", msg)
+ }
+ })
+
+	t.Run("ReturnsEmptyStringWhenErrorMissing", func(t *testing.T) {
+ raw := []byte(`{"output":[]}`)
+ if msg := ParseErrorMessage(raw); msg != "" {
+ t.Fatalf("expected empty message, got %q", msg)
+ }
+ })
+}
+
+func TestResponseFirstTextCaption(t *testing.T) {
+ resp := loadTestResponse(t, "caption-response.json")
+
+ if jsonPayload := resp.FirstJSON(); len(jsonPayload) != 0 {
+ t.Fatalf("expected no JSON payload, got: %s", jsonPayload)
+ }
+
+ text := resp.FirstText()
+ expected := "A bee gathers nectar from the vibrant red poppy’s center."
+ if text != expected {
+ t.Fatalf("unexpected caption text: %q", text)
+ }
+}
+
+func TestResponseFirstTextLabels(t *testing.T) {
+ resp := loadTestResponse(t, "labels-response.json")
+
+ if jsonPayload := resp.FirstJSON(); len(jsonPayload) != 0 {
+ t.Fatalf("expected no JSON payload, got: %s", jsonPayload)
+ }
+
+ text := resp.FirstText()
+ if len(text) == 0 {
+ t.Fatal("expected structured JSON string in text payload")
+ }
+ if text[0] != '{' {
+ t.Fatalf("expected JSON object in text payload, got %q", text)
+ }
+}
+
+func TestResponseFirstJSONFromStructuredPayload(t *testing.T) {
+ resp := &Response{
+ ID: "resp_structured",
+ Model: "gpt-5-mini",
+ Output: []ResponseOutput{
+ {
+ Role: "assistant",
+ Content: []ResponseContent{
+ {
+ Type: "output_json",
+ JSON: json.RawMessage(`{"labels":[{"name":"sunset"}]}`),
+ },
+ },
+ },
+ },
+ }
+
+ jsonPayload := resp.FirstJSON()
+ if len(jsonPayload) == 0 {
+ t.Fatal("expected JSON payload, got empty result")
+ }
+
+ var decoded struct {
+ Labels []map[string]string `json:"labels"`
+ }
+ if err := json.Unmarshal(jsonPayload, &decoded); err != nil {
+ t.Fatalf("failed to decode JSON payload: %v", err)
+ }
+
+ if len(decoded.Labels) != 1 || decoded.Labels[0]["name"] != "sunset" {
+ t.Fatalf("unexpected JSON payload: %+v", decoded.Labels)
+ }
+}
+
+func TestSchemaLabelsReturnsValidJSON(t *testing.T) {
+ raw := SchemaLabels(false)
+
+ var decoded map[string]any
+ if err := json.Unmarshal(raw, &decoded); err != nil {
+ t.Fatalf("schema should be valid JSON: %v", err)
+ }
+
+ if decoded["type"] != "object" {
+ t.Fatalf("expected type object, got %v", decoded["type"])
+ }
+}
diff --git a/internal/ai/vision/schema/labels.go b/internal/ai/vision/schema/labels.go
index 735a70cb9..6ecd26afa 100644
--- a/internal/ai/vision/schema/labels.go
+++ b/internal/ai/vision/schema/labels.go
@@ -1,16 +1,115 @@
package schema
-// LabelsDefault provides the minimal JSON schema for label responses used across engines.
-const (
- LabelsDefault = "{\n \"labels\": [{\n \"name\": \"\",\n \"confidence\": 0,\n \"topicality\": 0 }]\n}"
- LabelsNSFW = "{\n \"labels\": [{\n \"name\": \"\",\n \"confidence\": 0,\n \"topicality\": 0,\n \"nsfw\": false,\n \"nsfw_confidence\": 0\n }]\n}"
+import (
+ "encoding/json"
)
-// Labels returns the canonical label schema string.
-func Labels(nsfw bool) string {
+// LabelsJsonSchemaDefault and LabelsJsonSchemaNSFW define the canonical JSON
+// Schemas for label responses, while LabelsJsonDefault and LabelsJsonNSFW
+// provide matching minimal JSON examples shared across engines.
+const (
+ LabelsJsonSchemaDefault = `{
+ "type": "object",
+ "properties": {
+ "labels": {
+ "type": "array",
+ "items": {
+ "type": "object",
+ "properties": {
+ "name": {
+ "type": "string",
+ "minLength": 1
+ },
+ "confidence": {
+ "type": "number",
+ "minimum": 0,
+ "maximum": 1
+ },
+ "topicality": {
+ "type": "number",
+ "minimum": 0,
+ "maximum": 1
+ }
+ },
+ "required": ["name", "confidence", "topicality"],
+ "additionalProperties": false
+ },
+ "default": []
+ }
+ },
+ "required": ["labels"],
+ "additionalProperties": false
+}`
+ LabelsJsonDefault = "{\n \"labels\": [{\n \"name\": \"\",\n \"confidence\": 0,\n \"topicality\": 0 }]\n}"
+ LabelsJsonSchemaNSFW = `{
+ "type": "object",
+ "properties": {
+ "labels": {
+ "type": "array",
+ "items": {
+ "type": "object",
+ "properties": {
+ "name": {
+ "type": "string",
+ "minLength": 1
+ },
+ "confidence": {
+ "type": "number",
+ "minimum": 0,
+ "maximum": 1
+ },
+ "topicality": {
+ "type": "number",
+ "minimum": 0,
+ "maximum": 1
+ },
+ "nsfw": {
+ "type": "boolean"
+ },
+ "nsfw_confidence": {
+ "type": "number",
+ "minimum": 0,
+ "maximum": 1
+ }
+ },
+ "required": [
+ "name",
+ "confidence",
+ "topicality",
+ "nsfw",
+ "nsfw_confidence"
+ ],
+ "additionalProperties": false
+ },
+ "default": []
+ }
+ },
+ "required": ["labels"],
+ "additionalProperties": false
+}`
+ LabelsJsonNSFW = "{\n \"labels\": [{\n \"name\": \"\",\n \"confidence\": 0,\n \"topicality\": 0,\n \"nsfw\": false,\n \"nsfw_confidence\": 0\n }]\n}"
+)
+
+// LabelsJsonSchema returns the canonical label JSON Schema for OpenAI API endpoints.
+//
+// Related documentation and references:
+// - https://platform.openai.com/docs/guides/structured-outputs
+// - https://json-schema.org/learn/miscellaneous-examples
+func LabelsJsonSchema(nsfw bool) json.RawMessage {
if nsfw {
- return LabelsNSFW
+ return json.RawMessage(LabelsJsonSchemaNSFW)
} else {
- return LabelsDefault
+ return json.RawMessage(LabelsJsonSchemaDefault)
+ }
+}
+
+// LabelsJson returns the canonical label JSON string for Ollama vision models.
+//
+// Related documentation and references:
+// - https://www.alibabacloud.com/help/en/model-studio/json-mode
+// - https://www.json.org/json-en.html
+func LabelsJson(nsfw bool) string {
+ if nsfw {
+ return LabelsJsonNSFW
+ } else {
+ return LabelsJsonDefault
}
}
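+
+// A minimal caller-view sketch of when each helper applies (illustrative; the
+// actual request wiring lives in the engine-specific packages):
+//
+//	// OpenAI structured outputs take a full JSON Schema:
+//	format := schema.LabelsJsonSchema(false) // json.RawMessage
+//
+//	// Ollama's JSON mode is primed with an example document instead:
+//	prompt := "Respond with JSON like: " + schema.LabelsJson(false)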
diff --git a/internal/ai/vision/schema/name.go b/internal/ai/vision/schema/name.go
new file mode 100644
index 000000000..2f642f2f3
--- /dev/null
+++ b/internal/ai/vision/schema/name.go
@@ -0,0 +1,36 @@
+package schema
+
+import (
+ "bytes"
+ "encoding/json"
+ "fmt"
+
+ "github.com/photoprism/photoprism/pkg/clean"
+)
+
+const (
+ NamePrefix = "photoprism_vision"
+)
+
+// JsonSchemaName returns the versioned schema name to be used for API requests.
+func JsonSchemaName(schema json.RawMessage, version string) string {
+ var schemaName string
+
+ switch {
+ case bytes.Contains(schema, []byte("labels")):
+ schemaName = "labels"
+	case bytes.Contains(schema, []byte("caption")):
+ schemaName = "caption"
+ default:
+ schemaName = "schema"
+ }
+
+ version = clean.TypeLowerUnderscore(version)
+
+ if version == "" {
+ version = "v1"
+ }
+
+ return fmt.Sprintf("%s_%s_%s", NamePrefix, schemaName, version)
+}
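+
+// A minimal usage sketch (illustrative; the resulting name matches what the
+// OpenAI json_schema response format expects in its "name" field):
+//
+//	s := LabelsJsonSchema(false)
+//	name := JsonSchemaName(s, "v1") // "photoprism_vision_labels_v1"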
diff --git a/internal/ai/vision/schema/name_test.go b/internal/ai/vision/schema/name_test.go
new file mode 100644
index 000000000..c2d0897f7
--- /dev/null
+++ b/internal/ai/vision/schema/name_test.go
@@ -0,0 +1,23 @@
+package schema
+
+import (
+ "encoding/json"
+ "testing"
+
+ "github.com/stretchr/testify/assert"
+)
+
+func TestJsonSchemaName(t *testing.T) {
+ t.Run("Default", func(t *testing.T) {
+ assert.Equal(t, "photoprism_vision_schema_v1", JsonSchemaName(nil, ""))
+ })
+ t.Run("Labels", func(t *testing.T) {
+ assert.Equal(t, "photoprism_vision_labels_v1", JsonSchemaName(json.RawMessage(LabelsJsonSchemaDefault), ""))
+ })
+	t.Run("LabelsV2", func(t *testing.T) {
+ assert.Equal(t, "photoprism_vision_labels_v2", JsonSchemaName([]byte("labels"), "v2"))
+ })
+ t.Run("LabelsJsonSchema", func(t *testing.T) {
+ assert.Equal(t, "photoprism_vision_labels_v1", JsonSchemaName(LabelsJsonSchema(false), "v1"))
+ })
+}
diff --git a/internal/ai/vision/schema/schema.go b/internal/ai/vision/schema/schema.go
index 87a1dc6fc..4801a8477 100644
--- a/internal/ai/vision/schema/schema.go
+++ b/internal/ai/vision/schema/schema.go
@@ -1,5 +1,5 @@
/*
-Package schema defines canonical JSON schema templates shared by PhotoPrism's AI vision engines.
+Package schema defines canonical JSON and JSON Schema templates shared by PhotoPrism's AI vision engines.
Copyright (c) 2018 - 2025 PhotoPrism UG. All rights reserved.
diff --git a/internal/config/feat/vision.go b/internal/config/feat/vision.go
index ae415a829..16d95045b 100644
--- a/internal/config/feat/vision.go
+++ b/internal/config/feat/vision.go
@@ -4,5 +4,5 @@ package feat
var (
VisionModelGenerate = false // controls exposure of the generate endpoint and CLI commands
VisionModelMarkers = false // gates marker generation/return until downstream UI and reconciliation paths are ready
- VisionServiceOpenAI = false // controls whether users are able to configure OpenAI as a vision service engine
+ VisionServiceOpenAI = true // controls whether users are able to configure OpenAI as a vision service engine
)
diff --git a/internal/workers/vision.go b/internal/workers/vision.go
index 4d85b6dbd..597432d9a 100644
--- a/internal/workers/vision.go
+++ b/internal/workers/vision.go
@@ -135,6 +135,7 @@ func (w *Vision) Start(filter string, count int, models []string, customSrc stri
done := make(map[string]bool)
offset := 0
updated := 0
+ processed := 0
// Make sure count is within the allowed range.
if count < 1 || count > search.MaxResults {
@@ -197,6 +198,8 @@ func (w *Vision) Start(filter string, count int, models []string, customSrc stri
continue
}
+ processed++
+
fileName := photoprism.FileName(photo.FileRoot, photo.FileName)
file, fileErr := photoprism.NewMediaFile(fileName)
@@ -279,7 +282,18 @@ func (w *Vision) Start(filter string, count int, models []string, customSrc stri
}
}
- log.Infof("vision: updated %s [%s]", english.Plural(updated, "picture", "pictures"), time.Since(start))
+ elapsed := time.Since(start)
+
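+	// Summarize the run, distinguishing runs with nothing to process from
+	// runs that processed pictures without metadata changes.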
+ switch {
+ case processed == 0:
+ log.Infof("vision: no pictures required processing [%s]", elapsed)
+ case updated == processed:
+ log.Infof("vision: updated %s [%s]", english.Plural(updated, "picture", "pictures"), elapsed)
+ case updated == 0:
+ log.Infof("vision: processed %s (no metadata changes detected) [%s]", english.Plural(processed, "picture", "pictures"), elapsed)
+ default:
+ log.Infof("vision: updated %s out of %s [%s]", english.Plural(updated, "picture", "pictures"), english.Plural(processed, "picture", "pictures"), elapsed)
+ }
if updated > 0 {
updateIndex = true