diff --git a/internal/ai/vision/api_client.go b/internal/ai/vision/api_client.go index 29c24b259..62f757dfe 100644 --- a/internal/ai/vision/api_client.go +++ b/internal/ai/vision/api_client.go @@ -9,6 +9,9 @@ import ( "io" "net/http" + "github.com/sirupsen/logrus" + + "github.com/photoprism/photoprism/internal/ai/vision/ollama" "github.com/photoprism/photoprism/pkg/clean" "github.com/photoprism/photoprism/pkg/http/header" ) @@ -69,6 +72,10 @@ func PerformApiRequest(apiRequest *ApiRequest, uri, method, key string) (apiResp return nil, parseErr } + if log.IsLevelEnabled(logrus.TraceLevel) { + log.Tracef("vision: response %s", string(body)) + } + return parsed, nil } @@ -89,12 +96,12 @@ func PerformApiRequest(apiRequest *ApiRequest, uri, method, key string) (apiResp return apiResponse, nil } -func decodeOllamaResponse(data []byte) (*ApiResponseOllama, error) { - resp := &ApiResponseOllama{} +func decodeOllamaResponse(data []byte) (*ollama.Response, error) { + resp := &ollama.Response{} dec := json.NewDecoder(bytes.NewReader(data)) for { - var chunk ApiResponseOllama + var chunk ollama.Response if err := dec.Decode(&chunk); err != nil { if errors.Is(err, io.EOF) { break diff --git a/internal/ai/vision/api_client_test.go b/internal/ai/vision/api_client_test.go index d933a1227..d84219086 100644 --- a/internal/ai/vision/api_client_test.go +++ b/internal/ai/vision/api_client_test.go @@ -8,6 +8,7 @@ import ( "github.com/stretchr/testify/assert" + "github.com/photoprism/photoprism/internal/ai/vision/ollama" "github.com/photoprism/photoprism/pkg/http/scheme" ) @@ -49,7 +50,7 @@ func TestPerformApiRequestOllama(t *testing.T) { var req ApiRequest assert.NoError(t, json.NewDecoder(r.Body).Decode(&req)) assert.Equal(t, FormatJSON, req.Format) - assert.NoError(t, json.NewEncoder(w).Encode(ApiResponseOllama{ + assert.NoError(t, json.NewEncoder(w).Encode(ollama.Response{ Model: "qwen2.5vl:latest", Response: `{"labels":[{"name":"test","confidence":0.9,"topicality":0.8}]}`, })) @@ -72,7 +73,7 @@ func TestPerformApiRequestOllama(t *testing.T) { }) t.Run("LabelsWithCodeFence", func(t *testing.T) { server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { - assert.NoError(t, json.NewEncoder(w).Encode(ApiResponseOllama{ + assert.NoError(t, json.NewEncoder(w).Encode(ollama.Response{ Model: "gemma3:latest", Response: "```json\n{\"labels\":[{\"name\":\"lingerie\",\"confidence\":0.81,\"topicality\":0.73}]}\n```\nThe model provided additional commentary.", })) @@ -95,7 +96,7 @@ func TestPerformApiRequestOllama(t *testing.T) { }) t.Run("CaptionFallback", func(t *testing.T) { server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { - assert.NoError(t, json.NewEncoder(w).Encode(ApiResponseOllama{ + assert.NoError(t, json.NewEncoder(w).Encode(ollama.Response{ Model: "qwen2.5vl:latest", Response: "plain text", })) diff --git a/internal/ai/vision/api_ollama.go b/internal/ai/vision/api_ollama.go index 56f70d454..8231234c6 100644 --- a/internal/ai/vision/api_ollama.go +++ b/internal/ai/vision/api_ollama.go @@ -1,10 +1,8 @@ package vision import ( - "errors" "fmt" "os" - "time" "github.com/photoprism/photoprism/pkg/clean" "github.com/photoprism/photoprism/pkg/http/scheme" @@ -12,53 +10,6 @@ import ( "github.com/photoprism/photoprism/pkg/rnd" ) -// ApiResponseOllama represents a Ollama API service response. 
-type ApiResponseOllama struct { - Id string `yaml:"Id,omitempty" json:"id,omitempty"` - Code int `yaml:"Code,omitempty" json:"code,omitempty"` - Error string `yaml:"Error,omitempty" json:"error,omitempty"` - Model string `yaml:"Model,omitempty" json:"model,omitempty"` - CreatedAt time.Time `yaml:"CreatedAt,omitempty" json:"created_at,omitempty"` - Response string `yaml:"Response,omitempty" json:"response,omitempty"` - Done bool `yaml:"Done,omitempty" json:"done,omitempty"` - Context []int `yaml:"Context,omitempty" json:"context,omitempty"` - TotalDuration int64 `yaml:"TotalDuration,omitempty" json:"total_duration,omitempty"` - LoadDuration int `yaml:"LoadDuration,omitempty" json:"load_duration,omitempty"` - PromptEvalCount int `yaml:"PromptEvalCount,omitempty" json:"prompt_eval_count,omitempty"` - PromptEvalDuration int `yaml:"PromptEvalDuration,omitempty" json:"prompt_eval_duration,omitempty"` - EvalCount int `yaml:"EvalCount,omitempty" json:"eval_count,omitempty"` - EvalDuration int64 `yaml:"EvalDuration,omitempty" json:"eval_duration,omitempty"` - Result ApiResult `yaml:"Result,omitempty" json:"result,omitempty"` -} - -// Err returns an error if the request has failed. -func (r *ApiResponseOllama) Err() error { - if r == nil { - return errors.New("response is nil") - } - - if r.Code >= 400 { - if r.Error != "" { - return errors.New(r.Error) - } - - return fmt.Errorf("error %d", r.Code) - } else if r.Result.IsEmpty() { - return errors.New("no result") - } - - return nil -} - -// HasResult checks if there is at least one result in the response data. -func (r *ApiResponseOllama) HasResult() bool { - if r == nil { - return false - } - - return !r.Result.IsEmpty() -} - // NewApiRequestOllama returns a new Ollama API request with the specified images as payload. func NewApiRequestOllama(images Files, fileScheme scheme.Type) (*ApiRequest, error) { imagesData := make(Files, len(images)) diff --git a/internal/ai/vision/api_request.go b/internal/ai/vision/api_request.go index 6ca0450bb..f227b7a75 100644 --- a/internal/ai/vision/api_request.go +++ b/internal/ai/vision/api_request.go @@ -11,6 +11,8 @@ import ( "github.com/sirupsen/logrus" + "github.com/photoprism/photoprism/internal/ai/vision/openai" + "github.com/photoprism/photoprism/internal/ai/vision/schema" "github.com/photoprism/photoprism/internal/api/download" "github.com/photoprism/photoprism/pkg/clean" "github.com/photoprism/photoprism/pkg/fs" @@ -58,6 +60,11 @@ type ApiRequestOptions struct { UseMmap bool `yaml:"UseMmap,omitempty" json:"use_mmap,omitempty"` UseMlock bool `yaml:"UseMlock,omitempty" json:"use_mlock,omitempty"` NumThread int `yaml:"NumThread,omitempty" json:"num_thread,omitempty"` + MaxOutputTokens int `yaml:"MaxOutputTokens,omitempty" json:"max_output_tokens,omitempty"` + Detail string `yaml:"Detail,omitempty" json:"detail,omitempty"` + ForceJson bool `yaml:"ForceJson,omitempty" json:"force_json,omitempty"` + SchemaVersion string `yaml:"SchemaVersion,omitempty" json:"schema_version,omitempty"` + CombineOutputs string `yaml:"CombineOutputs,omitempty" json:"combine_outputs,omitempty"` } // ApiRequestContext represents a context parameter returned from a previous request. 
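Note: the option fields added above serialize through the snake_case JSON tags shown. Below is a minimal sketch of the resulting wire format, assuming in-repo usage (the `vision` package is internal) and illustrative values:

```go
package main

import (
	"encoding/json"
	"fmt"

	"github.com/photoprism/photoprism/internal/ai/vision"
)

func main() {
	// Set only the newly added fields; everything else keeps its zero value.
	opts := vision.ApiRequestOptions{
		MaxOutputTokens: 1024,
		Detail:          "low",
		ForceJson:       true,
	}

	b, err := json.Marshal(opts)
	if err != nil {
		panic(err)
	}

	// Prints the new snake_case keys, e.g. "max_output_tokens", "detail",
	// and "force_json", assuming the remaining fields carry omitempty tags.
	fmt.Println(string(b))
}
```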
@@ -77,6 +84,7 @@ type ApiRequest struct { Context *ApiRequestContext `form:"context" yaml:"Context,omitempty" json:"context,omitempty"` Stream bool `form:"stream" yaml:"Stream,omitempty" json:"stream"` Images Files `form:"images" yaml:"Images,omitempty" json:"images,omitempty"` + Schema json.RawMessage `form:"schema" yaml:"Schema,omitempty" json:"schema,omitempty"` ResponseFormat ApiFormat `form:"-" yaml:"-" json:"-"` } @@ -195,6 +203,14 @@ func (r *ApiRequest) GetResponseFormat() ApiFormat { // JSON returns the request data as JSON-encoded bytes. func (r *ApiRequest) JSON() ([]byte, error) { + if r == nil { + return nil, errors.New("api request is nil") + } + + if r.ResponseFormat == ApiFormatOpenAI { + return r.openAIJSON() + } + return json.Marshal(*r) } @@ -229,6 +245,8 @@ func (r *ApiRequest) sanitizedForLog() ApiRequest { sanitized.Url = sanitizeLogPayload(r.Url) + sanitized.Schema = r.Schema + return sanitized } @@ -287,3 +305,134 @@ func isLikelyBase64(value string) bool { return true } + +// openAIJSON converts the request data into an OpenAI Responses API payload. +func (r *ApiRequest) openAIJSON() ([]byte, error) { + detail := openai.DefaultDetail + + if opts := r.Options; opts != nil && strings.TrimSpace(opts.Detail) != "" { + detail = strings.TrimSpace(opts.Detail) + } + + messages := make([]openai.InputMessage, 0, 2) + + if system := strings.TrimSpace(r.System); system != "" { + messages = append(messages, openai.InputMessage{ + Role: "system", + Type: "message", + Content: []openai.ContentItem{ + { + Type: openai.ContentTypeText, + Text: system, + }, + }, + }) + } + + userContent := make([]openai.ContentItem, 0, len(r.Images)+1) + + if prompt := strings.TrimSpace(r.Prompt); prompt != "" { + userContent = append(userContent, openai.ContentItem{ + Type: openai.ContentTypeText, + Text: prompt, + }) + } + + for _, img := range r.Images { + if img == "" { + continue + } + + userContent = append(userContent, openai.ContentItem{ + Type: openai.ContentTypeImage, + ImageURL: img, + Detail: detail, + }) + } + + if len(userContent) > 0 { + messages = append(messages, openai.InputMessage{ + Role: "user", + Type: "message", + Content: userContent, + }) + } + + if len(messages) == 0 { + return nil, errors.New("openai request requires at least one message") + } + + payload := openai.HTTPRequest{ + Model: strings.TrimSpace(r.Model), + Input: messages, + } + + if payload.Model == "" { + payload.Model = openai.DefaultModel + } + + if strings.HasPrefix(strings.ToLower(payload.Model), "gpt-5") { + payload.Reasoning = &openai.Reasoning{Effort: "low"} + } + + if opts := r.Options; opts != nil { + if opts.MaxOutputTokens > 0 { + payload.MaxOutputTokens = opts.MaxOutputTokens + } + + if opts.Temperature > 0 { + payload.Temperature = opts.Temperature + } + + if opts.TopP > 0 { + payload.TopP = opts.TopP + } + + if opts.PresencePenalty != 0 { + payload.PresencePenalty = opts.PresencePenalty + } + + if opts.FrequencyPenalty != 0 { + payload.FrequencyPenalty = opts.FrequencyPenalty + } + } + + if format := buildOpenAIResponseFormat(r); format != nil { + payload.Text = &openai.TextOptions{ + Format: format, + } + } + + return json.Marshal(payload) +} + +// buildOpenAIResponseFormat determines which response_format to send to OpenAI. 
+func buildOpenAIResponseFormat(r *ApiRequest) *openai.ResponseFormat { + if r == nil { + return nil + } + + opts := r.Options + hasSchema := len(r.Schema) > 0 + + if !hasSchema && (opts == nil || !opts.ForceJson) { + return nil + } + + result := &openai.ResponseFormat{} + + if hasSchema { + result.Type = openai.ResponseFormatJSONSchema + result.Schema = r.Schema + + if opts != nil && strings.TrimSpace(opts.SchemaVersion) != "" { + result.Name = strings.TrimSpace(opts.SchemaVersion) + } else { + result.Name = schema.JsonSchemaName(r.Schema, openai.DefaultSchemaVersion) + } + } else { + result.Type = openai.ResponseFormatJSONObject + } + + return result +} diff --git a/internal/ai/vision/caption.go b/internal/ai/vision/caption.go index 585795f7b..6d27eae32 100644 --- a/internal/ai/vision/caption.go +++ b/internal/ai/vision/caption.go @@ -53,7 +53,11 @@ func captionInternal(images Files, mediaSrc media.Src) (result *CaptionResult, m apiRequest.System = model.GetSystemPrompt() apiRequest.Prompt = model.GetPrompt() - apiRequest.Options = model.GetOptions() + + if apiRequest.Options == nil { + apiRequest.Options = model.GetOptions() + } + apiRequest.WriteLog() if apiResponse, err = PerformApiRequest(apiRequest, uri, method, model.EndpointKey()); err != nil { diff --git a/internal/ai/vision/engine.go b/internal/ai/vision/engine.go index 8839fc97f..67da799ef 100644 --- a/internal/ai/vision/engine.go +++ b/internal/ai/vision/engine.go @@ -58,14 +58,15 @@ func init() { RegisterEngineAlias(EngineVision, EngineInfo{ RequestFormat: ApiFormatVision, ResponseFormat: ApiFormatVision, - FileScheme: string(scheme.Data), + FileScheme: scheme.Data, DefaultResolution: DefaultResolution, }) RegisterEngineAlias(openai.EngineName, EngineInfo{ + Uri: "https://api.openai.com/v1/responses", RequestFormat: ApiFormatOpenAI, ResponseFormat: ApiFormatOpenAI, - FileScheme: string(scheme.Data), + FileScheme: scheme.Data, DefaultResolution: openai.DefaultResolution, }) } @@ -79,6 +80,7 @@ func RegisterEngine(format ApiFormat, engine Engine) { // EngineInfo describes metadata that can be associated with an engine alias. 
type EngineInfo struct { + Uri string RequestFormat ApiFormat ResponseFormat ApiFormat FileScheme string diff --git a/internal/ai/vision/engine_ollama.go b/internal/ai/vision/engine_ollama.go index 413078dc4..816505037 100644 --- a/internal/ai/vision/engine_ollama.go +++ b/internal/ai/vision/engine_ollama.go @@ -28,7 +28,7 @@ func init() { RegisterEngineAlias(ollama.EngineName, EngineInfo{ RequestFormat: ApiFormatOllama, ResponseFormat: ApiFormatOllama, - FileScheme: string(scheme.Base64), + FileScheme: scheme.Base64, DefaultResolution: ollama.DefaultResolution, }) @@ -72,7 +72,7 @@ func (ollamaDefaults) SchemaTemplate(model *Model) string { switch model.Type { case ModelTypeLabels: - return ollama.LabelsSchema(model.PromptContains("nsfw")) + return ollama.SchemaLabels(model.PromptContains("nsfw")) } return "" @@ -134,64 +134,93 @@ func (ollamaParser) Parse(ctx context.Context, req *ApiRequest, raw []byte, stat return nil, err } - result := &ApiResponse{ + response := &ApiResponse{ Id: req.GetId(), Code: status, Model: &Model{Name: ollamaResp.Model}, Result: ApiResult{ - Labels: append([]LabelResult{}, ollamaResp.Result.Labels...), - Caption: func() *CaptionResult { - if ollamaResp.Result.Caption != nil { - copyCaption := *ollamaResp.Result.Caption - return ©Caption - } - return nil - }(), + Labels: convertOllamaLabels(ollamaResp.Result.Labels), + Caption: convertOllamaCaption(ollamaResp.Result.Caption), }, } - parsedLabels := len(result.Result.Labels) > 0 + parsedLabels := len(response.Result.Labels) > 0 if !parsedLabels && strings.TrimSpace(ollamaResp.Response) != "" && req.Format == FormatJSON { if labels, parseErr := parseOllamaLabels(ollamaResp.Response); parseErr != nil { log.Debugf("vision: %s (parse ollama labels)", clean.Error(parseErr)) } else if len(labels) > 0 { - result.Result.Labels = append(result.Result.Labels, labels...) + response.Result.Labels = append(response.Result.Labels, labels...) parsedLabels = true } } if parsedLabels { - filtered := result.Result.Labels[:0] - for i := range result.Result.Labels { - if result.Result.Labels[i].Confidence <= 0 { - result.Result.Labels[i].Confidence = ollama.LabelConfidenceDefault + filtered := response.Result.Labels[:0] + for i := range response.Result.Labels { + if response.Result.Labels[i].Confidence <= 0 { + response.Result.Labels[i].Confidence = ollama.LabelConfidenceDefault } - if result.Result.Labels[i].Topicality <= 0 { - result.Result.Labels[i].Topicality = result.Result.Labels[i].Confidence + if response.Result.Labels[i].Topicality <= 0 { + response.Result.Labels[i].Topicality = response.Result.Labels[i].Confidence } // Apply thresholds and canonicalize the name. 
- normalizeLabelResult(&result.Result.Labels[i]) + normalizeLabelResult(&response.Result.Labels[i]) - if result.Result.Labels[i].Name == "" { + if response.Result.Labels[i].Name == "" { continue } - if result.Result.Labels[i].Source == "" { - result.Result.Labels[i].Source = entity.SrcOllama + if response.Result.Labels[i].Source == "" { + response.Result.Labels[i].Source = entity.SrcOllama } - filtered = append(filtered, result.Result.Labels[i]) + filtered = append(filtered, response.Result.Labels[i]) } - result.Result.Labels = filtered + response.Result.Labels = filtered } else if caption := strings.TrimSpace(ollamaResp.Response); caption != "" { - result.Result.Caption = &CaptionResult{ + response.Result.Caption = &CaptionResult{ Text: caption, Source: entity.SrcOllama, } } - return result, nil + return response, nil +} + +func convertOllamaLabels(payload []ollama.LabelPayload) []LabelResult { + if len(payload) == 0 { + return nil + } + + labels := make([]LabelResult, len(payload)) + + for i := range payload { + labels[i] = LabelResult{ + Name: payload[i].Name, + Source: payload[i].Source, + Priority: payload[i].Priority, + Confidence: payload[i].Confidence, + Topicality: payload[i].Topicality, + Categories: payload[i].Categories, + NSFW: payload[i].NSFW, + NSFWConfidence: payload[i].NSFWConfidence, + } + } + + return labels +} + +func convertOllamaCaption(payload *ollama.CaptionPayload) *CaptionResult { + if payload == nil { + return nil + } + + return &CaptionResult{ + Text: payload.Text, + Source: payload.Source, + Confidence: payload.Confidence, + } } diff --git a/internal/ai/vision/engine_ollama_test.go b/internal/ai/vision/engine_ollama_test.go index dffc6fe7d..44d62bdeb 100644 --- a/internal/ai/vision/engine_ollama_test.go +++ b/internal/ai/vision/engine_ollama_test.go @@ -10,9 +10,9 @@ import ( func TestOllamaDefaultConfidenceApplied(t *testing.T) { req := &ApiRequest{Format: FormatJSON} - payload := ApiResponseOllama{ - Result: ApiResult{ - Labels: []LabelResult{{Name: "forest path", Confidence: 0, Topicality: 0}}, + payload := ollama.Response{ + Result: ollama.ResultPayload{ + Labels: []ollama.LabelPayload{{Name: "forest path", Confidence: 0, Topicality: 0}}, }, } raw, err := json.Marshal(payload) diff --git a/internal/ai/vision/engine_openai.go b/internal/ai/vision/engine_openai.go index 8a73c2431..3cb2a226e 100644 --- a/internal/ai/vision/engine_openai.go +++ b/internal/ai/vision/engine_openai.go @@ -1,18 +1,342 @@ package vision import ( + "context" + "encoding/json" + "errors" + "fmt" + "strings" + "github.com/photoprism/photoprism/internal/ai/vision/openai" + "github.com/photoprism/photoprism/internal/entity" + "github.com/photoprism/photoprism/pkg/clean" "github.com/photoprism/photoprism/pkg/http/scheme" ) -// init registers the OpenAI engine alias so models can set Engine: "openai" -// and inherit sensible defaults (request/response formats, file scheme, and -// preferred thumbnail resolution). +// openaiDefaults provides canned prompts, schema templates, and options for OpenAI engines. +type openaiDefaults struct{} + +// openaiBuilder prepares ApiRequest objects for OpenAI's Responses API. +type openaiBuilder struct{} + +// openaiParser converts Responses API payloads into ApiResponse instances. 
+type openaiParser struct{} + func init() { - RegisterEngineAlias(openai.EngineName, EngineInfo{ - RequestFormat: ApiFormatOpenAI, - ResponseFormat: ApiFormatOpenAI, - FileScheme: string(scheme.Base64), - DefaultResolution: openai.DefaultResolution, + RegisterEngine(ApiFormatOpenAI, Engine{ + Builder: openaiBuilder{}, + Parser: openaiParser{}, + Defaults: openaiDefaults{}, }) } + +// SystemPrompt returns the default OpenAI system prompt for the specified model type. +func (openaiDefaults) SystemPrompt(model *Model) string { + if model == nil { + return "" + } + + switch model.Type { + case ModelTypeCaption: + return openai.CaptionSystem + case ModelTypeLabels: + return openai.LabelSystem + default: + return "" + } +} + +// UserPrompt returns the default OpenAI user prompt for the specified model type. +func (openaiDefaults) UserPrompt(model *Model) string { + if model == nil { + return "" + } + + switch model.Type { + case ModelTypeCaption: + return openai.CaptionPrompt + case ModelTypeLabels: + if DetectNSFWLabels { + return openai.LabelPromptNSFW + } + return openai.LabelPromptDefault + default: + return "" + } +} + +// SchemaTemplate returns the JSON schema template for the model, if applicable. +func (openaiDefaults) SchemaTemplate(model *Model) string { + if model == nil { + return "" + } + + switch model.Type { + case ModelTypeLabels: + return string(openai.SchemaLabels(model.PromptContains("nsfw"))) + default: + return "" + } +} + +// Options returns default OpenAI request options for the model. +func (openaiDefaults) Options(model *Model) *ApiRequestOptions { + if model == nil { + return nil + } + + switch model.Type { + case ModelTypeCaption: + /* + Options: + Detail: low + MaxOutputTokens: 512 + Temperature: 0.1 + TopP: 0.9 + (Sampling values are zeroed for GPT-5 models in openaiBuilder.Build.) + */ + return &ApiRequestOptions{ + Detail: openai.DefaultDetail, + MaxOutputTokens: openai.CaptionMaxTokens, + Temperature: openai.DefaultTemperature, + TopP: openai.DefaultTopP, + } + case ModelTypeLabels: + /* + Options: + Detail: low + MaxOutputTokens: 1024 + Temperature: 0.1 + ForceJson: true + SchemaVersion: "photoprism_vision_labels_v1" + (Sampling values are zeroed for GPT-5 models in openaiBuilder.Build.) + */ + return &ApiRequestOptions{ + Detail: openai.DefaultDetail, + MaxOutputTokens: openai.LabelsMaxTokens, + Temperature: openai.DefaultTemperature, + TopP: openai.DefaultTopP, + ForceJson: true, + } + default: + return nil + } +} + +// Build constructs an OpenAI request payload using base64-encoded thumbnails. +func (openaiBuilder) Build(ctx context.Context, model *Model, files Files) (*ApiRequest, error) { + if model == nil { + return nil, ErrInvalidModel + } + + dataReq, err := NewApiRequestImages(files, scheme.Data) + if err != nil { + return nil, err + } + + req := &ApiRequest{ + Id: dataReq.Id, + Images: append(Files(nil), dataReq.Images...), + ResponseFormat: ApiFormatOpenAI, + } + + if opts := model.GetOptions(); opts != nil { + req.Options = cloneOptions(opts) + if model.Type == ModelTypeCaption { + // Captions default to plain text responses; structured JSON is optional. 
+ req.Options.ForceJson = false + if req.Options.MaxOutputTokens < openai.CaptionMaxTokens { + req.Options.MaxOutputTokens = openai.CaptionMaxTokens + } + } else if model.Type == ModelTypeLabels { + if req.Options.MaxOutputTokens < openai.LabelsMaxTokens { + req.Options.MaxOutputTokens = openai.LabelsMaxTokens + } + } + + if strings.HasPrefix(strings.ToLower(strings.TrimSpace(model.Name)), "gpt-5") { + req.Options.Temperature = 0 + req.Options.TopP = 0 + } + } + + if schema := strings.TrimSpace(model.SchemaTemplate()); schema != "" { + if raw, parseErr := parseOpenAISchema(schema); parseErr != nil { + log.Warnf("vision: failed to parse OpenAI schema template (%s)", clean.Error(parseErr)) + } else { + req.Schema = raw + } + } + + return req, nil +} + +// Parse converts an OpenAI Responses API payload into the internal ApiResponse representation. +func (openaiParser) Parse(ctx context.Context, req *ApiRequest, raw []byte, status int) (*ApiResponse, error) { + if status >= 300 { + if msg := openai.ParseErrorMessage(raw); msg != "" { + return nil, fmt.Errorf("openai: %s", msg) + } + return nil, fmt.Errorf("openai: status %d", status) + } + + var resp openai.Response + if err := json.Unmarshal(raw, &resp); err != nil { + return nil, err + } + + if resp.Error != nil && resp.Error.Message != "" { + return nil, errors.New(resp.Error.Message) + } + + result := ApiResult{} + if jsonPayload := resp.FirstJSON(); len(jsonPayload) > 0 { + if err := populateOpenAIJSONResult(&result, jsonPayload); err != nil { + log.Debugf("vision: %s (parse openai json payload)", clean.Error(err)) + } + } + + if result.Caption == nil { + if text := resp.FirstText(); text != "" { + trimmed := strings.TrimSpace(text) + var parsedJSON bool + + if len(trimmed) > 0 && (trimmed[0] == '{' || trimmed[0] == '[') { + if err := populateOpenAIJSONResult(&result, json.RawMessage(trimmed)); err != nil { + log.Debugf("vision: %s (parse openai json text payload)", clean.Error(err)) + } else { + parsedJSON = true + } + } + + if !parsedJSON && trimmed != "" { + result.Caption = &CaptionResult{ + Text: trimmed, + Source: entity.SrcOpenAI, + } + } + } + } + + var responseID string + if req != nil { + responseID = req.GetId() + } + + modelName := strings.TrimSpace(resp.Model) + if modelName == "" && req != nil { + modelName = strings.TrimSpace(req.Model) + } + + return &ApiResponse{ + Id: responseID, + Code: status, + Model: &Model{Name: modelName}, + Result: result, + }, nil +} + +// parseOpenAISchema validates the provided JSON schema and returns it as a raw message. +func parseOpenAISchema(schema string) (json.RawMessage, error) { + var raw json.RawMessage + if err := json.Unmarshal([]byte(schema), &raw); err != nil { + return nil, err + } + return normalizeOpenAISchema(raw) +} + +// normalizeOpenAISchema upgrades legacy label schema definitions so they comply with +// OpenAI's json_schema format requirements. +func normalizeOpenAISchema(raw json.RawMessage) (json.RawMessage, error) { + if len(raw) == 0 { + return raw, nil + } + + var doc map[string]any + if err := json.Unmarshal(raw, &doc); err != nil { + // Fallback to the original payload if it isn't a JSON object. 
+ return raw, nil + } + + if t, ok := doc["type"]; ok { + if typeStr, ok := t.(string); ok && strings.TrimSpace(typeStr) != "" { + return raw, nil + } + } + + if _, ok := doc["properties"]; ok { + return raw, nil + } + + labels, ok := doc["labels"] + if !ok { + return raw, nil + } + + nsfw := false + + if items, ok := labels.([]any); ok && len(items) > 0 { + if first, ok := items[0].(map[string]any); ok { + if _, hasNSFW := first["nsfw"]; hasNSFW { + nsfw = true + } + if _, hasNSFWConfidence := first["nsfw_confidence"]; hasNSFWConfidence { + nsfw = true + } + } + } + + return openai.SchemaLabels(nsfw), nil +} + +// populateOpenAIJSONResult unmarshals a structured OpenAI response into ApiResult fields. +func populateOpenAIJSONResult(result *ApiResult, payload json.RawMessage) error { + if result == nil || len(payload) == 0 { + return nil + } + + var envelope struct { + Caption *struct { + Text string `json:"text"` + Confidence float32 `json:"confidence"` + } `json:"caption"` + Labels []LabelResult `json:"labels"` + } + + if err := json.Unmarshal(payload, &envelope); err != nil { + return err + } + + if envelope.Caption != nil { + text := strings.TrimSpace(envelope.Caption.Text) + if text != "" { + result.Caption = &CaptionResult{ + Text: text, + Confidence: envelope.Caption.Confidence, + Source: entity.SrcOpenAI, + } + } + } + + if len(envelope.Labels) > 0 { + filtered := envelope.Labels[:0] + + for i := range envelope.Labels { + if envelope.Labels[i].Source == "" { + envelope.Labels[i].Source = entity.SrcOpenAI + } + + normalizeLabelResult(&envelope.Labels[i]) + + if envelope.Labels[i].Name == "" { + continue + } + + filtered = append(filtered, envelope.Labels[i]) + } + + result.Labels = append(result.Labels, filtered...) + } + + return nil +} diff --git a/internal/ai/vision/engine_openai_test.go b/internal/ai/vision/engine_openai_test.go new file mode 100644 index 000000000..6fa163b3a --- /dev/null +++ b/internal/ai/vision/engine_openai_test.go @@ -0,0 +1,337 @@ +package vision + +import ( + "context" + "encoding/json" + "net/http" + "net/http/httptest" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "github.com/photoprism/photoprism/internal/ai/vision/openai" + "github.com/photoprism/photoprism/internal/ai/vision/schema" + "github.com/photoprism/photoprism/internal/entity" +) + +func TestOpenAIBuilderBuild(t *testing.T) { + model := &Model{ + Type: ModelTypeLabels, + Name: openai.DefaultModel, + Engine: openai.EngineName, + } + model.ApplyEngineDefaults() + + request, err := openaiBuilder{}.Build(context.Background(), model, Files{examplesPath + "/chameleon_lime.jpg"}) + require.NoError(t, err) + require.NotNil(t, request) + + assert.Equal(t, ApiFormatOpenAI, request.ResponseFormat) + assert.NotEmpty(t, request.Images) + assert.NotNil(t, request.Options) + assert.Equal(t, openai.DefaultDetail, request.Options.Detail) + assert.True(t, request.Options.ForceJson) + assert.GreaterOrEqual(t, request.Options.MaxOutputTokens, openai.LabelsMaxTokens) +} + +func TestOpenAIBuilderBuildCaptionDisablesForceJSON(t *testing.T) { + model := &Model{ + Type: ModelTypeCaption, + Name: openai.DefaultModel, + Engine: openai.EngineName, + Options: &ApiRequestOptions{ForceJson: true}, + } + model.ApplyEngineDefaults() + + request, err := openaiBuilder{}.Build(context.Background(), model, Files{examplesPath + "/chameleon_lime.jpg"}) + require.NoError(t, err) + require.NotNil(t, request) + require.NotNil(t, request.Options) + assert.False(t, 
request.Options.ForceJson) + assert.GreaterOrEqual(t, request.Options.MaxOutputTokens, openai.CaptionMaxTokens) +} + +func TestApiRequestJSONForOpenAI(t *testing.T) { + req := &ApiRequest{ + Model: "gpt-5-mini", + System: "system", + Prompt: "describe the scene", + Images: []string{"data:image/jpeg;base64,AA=="}, + ResponseFormat: ApiFormatOpenAI, + Options: &ApiRequestOptions{ + Detail: openai.DefaultDetail, + MaxOutputTokens: 128, + Temperature: 0.2, + TopP: 0.8, + ForceJson: true, + }, + Schema: json.RawMessage(`{"type":"object","properties":{"caption":{"type":"object"}}}`), + } + + payload, err := req.JSON() + require.NoError(t, err) + + var decoded struct { + Model string `json:"model"` + Input []struct { + Role string `json:"role"` + Content []struct { + Type string `json:"type"` + } `json:"content"` + } `json:"input"` + Text struct { + Format struct { + Type string `json:"type"` + Name string `json:"name"` + Schema json.RawMessage `json:"schema"` + Strict bool `json:"strict"` + } `json:"format"` + } `json:"text"` + Reasoning struct { + Effort string `json:"effort"` + } `json:"reasoning"` + MaxOutputTokens int `json:"max_output_tokens"` + } + + require.NoError(t, json.Unmarshal(payload, &decoded)) + assert.Equal(t, "gpt-5-mini", decoded.Model) + require.Len(t, decoded.Input, 2) + assert.Equal(t, "system", decoded.Input[0].Role) + assert.Equal(t, openai.ResponseFormatJSONSchema, decoded.Text.Format.Type) + assert.Equal(t, schema.JsonSchemaName(decoded.Text.Format.Schema, openai.DefaultSchemaVersion), decoded.Text.Format.Name) + assert.False(t, decoded.Text.Format.Strict) + assert.NotNil(t, decoded.Text.Format.Schema) + assert.Equal(t, "low", decoded.Reasoning.Effort) + assert.Equal(t, 128, decoded.MaxOutputTokens) +} + +func TestApiRequestJSONForOpenAIDefaultSchemaName(t *testing.T) { + req := &ApiRequest{ + Model: "gpt-5-mini", + Images: []string{"data:image/jpeg;base64,AA=="}, + ResponseFormat: ApiFormatOpenAI, + Options: &ApiRequestOptions{ + Detail: openai.DefaultDetail, + MaxOutputTokens: 64, + ForceJson: true, + }, + Schema: json.RawMessage(`{"type":"object"}`), + } + + payload, err := req.JSON() + require.NoError(t, err) + + var decoded struct { + Text struct { + Format struct { + Name string `json:"name"` + } `json:"format"` + } `json:"text"` + } + + require.NoError(t, json.Unmarshal(payload, &decoded)) + assert.Equal(t, schema.JsonSchemaName(req.Schema, openai.DefaultSchemaVersion), decoded.Text.Format.Name) +} + +func TestOpenAIParserParsesJSONFromTextPayload(t *testing.T) { + respPayload := `{ + "id": "resp_123", + "model": "gpt-5-mini", + "output": [{ + "role": "assistant", + "content": [{ + "type": "output_text", + "text": "{\"labels\":[{\"name\":\"deer\",\"confidence\":0.98,\"topicality\":0.99}]}" + }] + }] + }` + + req := &ApiRequest{ + Id: "test", + Model: "gpt-5-mini", + ResponseFormat: ApiFormatOpenAI, + } + + resp, err := openaiParser{}.Parse(context.Background(), req, []byte(respPayload), http.StatusOK) + require.NoError(t, err) + require.NotNil(t, resp) + require.Len(t, resp.Result.Labels, 1) + assert.Equal(t, "Deer", resp.Result.Labels[0].Name) + assert.Nil(t, resp.Result.Caption) +} + +func TestParseOpenAISchemaLegacyUpgrade(t *testing.T) { + legacy := `{ + "labels": [{ + "name": "", + "confidence": 0, + "topicality": 0 + }] + }` + + raw, err := parseOpenAISchema(legacy) + require.NoError(t, err) + + var decoded map[string]any + require.NoError(t, json.Unmarshal(raw, &decoded)) + + assert.Equal(t, "object", decoded["type"]) + + props, ok := 
decoded["properties"].(map[string]any) + require.True(t, ok) + labels, ok := props["labels"].(map[string]any) + require.True(t, ok) + assert.Equal(t, "array", labels["type"]) +} + +func TestParseOpenAISchemaLegacyUpgradeNSFW(t *testing.T) { + legacy := `{ + "labels": [{ + "name": "", + "confidence": 0, + "topicality": 0, + "nsfw": false, + "nsfw_confidence": 0 + }] + }` + + raw, err := parseOpenAISchema(legacy) + require.NoError(t, err) + + var decoded map[string]any + require.NoError(t, json.Unmarshal(raw, &decoded)) + + props := decoded["properties"].(map[string]any) + labels := props["labels"].(map[string]any) + items := labels["items"].(map[string]any) + _, hasNSFW := items["properties"].(map[string]any)["nsfw"] + assert.True(t, hasNSFW) +} + +func TestPerformApiRequestOpenAISuccess(t *testing.T) { + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + var reqPayload struct { + Model string `json:"model"` + } + assert.NoError(t, json.NewDecoder(r.Body).Decode(&reqPayload)) + assert.Equal(t, "gpt-5-mini", reqPayload.Model) + + response := map[string]any{ + "id": "resp_123", + "model": "gpt-5-mini", + "output": []any{ + map[string]any{ + "role": "assistant", + "content": []any{ + map[string]any{ + "type": "output_json", + "json": map[string]any{ + "caption": map[string]any{ + "text": "A cat rests on a windowsill.", + "confidence": 0.91, + }, + "labels": []map[string]any{ + { + "name": "cat", + "confidence": 0.92, + "topicality": 0.88, + }, + }, + }, + }, + }, + }, + }, + } + + assert.NoError(t, json.NewEncoder(w).Encode(response)) + })) + defer server.Close() + + req := &ApiRequest{ + Id: "test", + Model: "gpt-5-mini", + Images: []string{"data:image/jpeg;base64,AA=="}, + ResponseFormat: ApiFormatOpenAI, + Options: &ApiRequestOptions{ + Detail: openai.DefaultDetail, + }, + Schema: json.RawMessage(`{"type":"object"}`), + } + + resp, err := PerformApiRequest(req, server.URL, http.MethodPost, "secret") + require.NoError(t, err) + require.NotNil(t, resp) + + require.NotNil(t, resp.Result.Caption) + assert.Equal(t, entity.SrcOpenAI, resp.Result.Caption.Source) + assert.Equal(t, "A cat rests on a windowsill.", resp.Result.Caption.Text) + + require.Len(t, resp.Result.Labels, 1) + assert.Equal(t, entity.SrcOpenAI, resp.Result.Labels[0].Source) + assert.Equal(t, "Cat", resp.Result.Labels[0].Name) +} + +func TestPerformApiRequestOpenAITextFallback(t *testing.T) { + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + response := map[string]any{ + "id": "resp_456", + "model": "gpt-5-mini", + "output": []any{ + map[string]any{ + "role": "assistant", + "content": []any{ + map[string]any{ + "type": "output_text", + "text": "Two hikers reach the summit at sunset.", + }, + }, + }, + }, + } + assert.NoError(t, json.NewEncoder(w).Encode(response)) + })) + defer server.Close() + + req := &ApiRequest{ + Id: "fallback", + Model: "gpt-5-mini", + Images: []string{"data:image/jpeg;base64,AA=="}, + ResponseFormat: ApiFormatOpenAI, + Options: &ApiRequestOptions{ + Detail: openai.DefaultDetail, + }, + Schema: nil, + } + + resp, err := PerformApiRequest(req, server.URL, http.MethodPost, "") + require.NoError(t, err) + require.NotNil(t, resp.Result.Caption) + assert.Equal(t, "Two hikers reach the summit at sunset.", resp.Result.Caption.Text) + assert.Equal(t, entity.SrcOpenAI, resp.Result.Caption.Source) +} + +func TestPerformApiRequestOpenAIError(t *testing.T) { + server := httptest.NewServer(http.HandlerFunc(func(w 
http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusBadRequest) + _ = json.NewEncoder(w).Encode(map[string]any{ + "error": map[string]any{ + "message": "Invalid image payload", + }, + }) + })) + defer server.Close() + + req := &ApiRequest{ + Id: "error", + Model: "gpt-5-mini", + ResponseFormat: ApiFormatOpenAI, + Schema: nil, + Images: []string{"data:image/jpeg;base64,AA=="}, + } + + _, err := PerformApiRequest(req, server.URL, http.MethodPost, "") + require.Error(t, err) + assert.Contains(t, err.Error(), "Invalid image payload") +} diff --git a/internal/ai/vision/labels.go b/internal/ai/vision/labels.go index f80f90da8..a148bbfa1 100644 --- a/internal/ai/vision/labels.go +++ b/internal/ai/vision/labels.go @@ -96,8 +96,10 @@ func labelsInternal(images Files, mediaSrc media.Src, labelSrc entity.Src) (resu apiRequest.Prompt = prompt } - if options := model.GetOptions(); options != nil { - apiRequest.Options = options + if apiRequest.Options == nil { + if options := model.GetOptions(); options != nil { + apiRequest.Options = options + } } apiRequest.WriteLog() diff --git a/internal/ai/vision/model.go b/internal/ai/vision/model.go index 8054eb1cb..db2f3a275 100644 --- a/internal/ai/vision/model.go +++ b/internal/ai/vision/model.go @@ -348,6 +348,26 @@ func mergeOptionDefaults(target, defaults *ApiRequestOptions) { if len(target.Stop) == 0 && len(defaults.Stop) > 0 { target.Stop = append([]string(nil), defaults.Stop...) } + + if target.MaxOutputTokens <= 0 && defaults.MaxOutputTokens > 0 { + target.MaxOutputTokens = defaults.MaxOutputTokens + } + + if strings.TrimSpace(target.Detail) == "" && strings.TrimSpace(defaults.Detail) != "" { + target.Detail = strings.TrimSpace(defaults.Detail) + } + + if !target.ForceJson && defaults.ForceJson { + target.ForceJson = true + } + + if target.SchemaVersion == "" && defaults.SchemaVersion != "" { + target.SchemaVersion = defaults.SchemaVersion + } + + if target.CombineOutputs == "" && defaults.CombineOutputs != "" { + target.CombineOutputs = defaults.CombineOutputs + } } func normalizeOptions(opts *ApiRequestOptions) { @@ -422,6 +442,10 @@ func (m *Model) ApplyEngineDefaults() { } if info, ok := EngineInfoFor(engine); ok { + if m.Service.Uri == "" { + m.Service.Uri = info.Uri + } + if m.Service.RequestFormat == "" { m.Service.RequestFormat = info.RequestFormat } @@ -490,7 +514,7 @@ func (m *Model) SchemaTemplate() string { } if m.schema == "" { - m.schema = visionschema.Labels(m.PromptContains("nsfw")) + m.schema = visionschema.LabelsJson(m.PromptContains("nsfw")) } } }) diff --git a/internal/ai/vision/ollama/defaults.go b/internal/ai/vision/ollama/defaults.go index 145e710eb..64530def9 100644 --- a/internal/ai/vision/ollama/defaults.go +++ b/internal/ai/vision/ollama/defaults.go @@ -1,7 +1,5 @@ package ollama -import "github.com/photoprism/photoprism/internal/ai/vision/schema" - const ( // CaptionPrompt instructs Ollama caption models to emit a single, active-voice sentence. CaptionPrompt = "Create a caption with exactly one sentence in the active voice that describes the main visual content. Begin with the main subject and clear action. Avoid text formatting, meta-language, and filler words." @@ -22,12 +20,3 @@ const ( // DefaultResolution is the default thumbnail size submitted to Ollama models. DefaultResolution = 720 ) - -// LabelsSchema returns the canonical label schema string consumed by Ollama models. 
-func LabelsSchema(nsfw bool) string { - if nsfw { - return schema.LabelsNSFW - } else { - return schema.LabelsDefault - } -} diff --git a/internal/ai/vision/ollama/schema.go b/internal/ai/vision/ollama/schema.go new file mode 100644 index 000000000..1bbcd857b --- /dev/null +++ b/internal/ai/vision/ollama/schema.go @@ -0,0 +1,14 @@ +package ollama + +import ( + "github.com/photoprism/photoprism/internal/ai/vision/schema" +) + +// SchemaLabels returns the canonical label schema string consumed by Ollama models. +// +// Related documentation and references: +// - https://www.alibabacloud.com/help/en/model-studio/json-mode +// - https://www.json.org/json-en.html +func SchemaLabels(nsfw bool) string { + return schema.LabelsJson(nsfw) +} diff --git a/internal/ai/vision/ollama/transport.go b/internal/ai/vision/ollama/transport.go new file mode 100644 index 000000000..bf0be34ab --- /dev/null +++ b/internal/ai/vision/ollama/transport.go @@ -0,0 +1,79 @@ +package ollama + +import ( + "errors" + "fmt" + "time" +) + +// Response encapsulates the subset of the Ollama generate API response we care about. +type Response struct { + ID string `yaml:"Id,omitempty" json:"id,omitempty"` + Code int `yaml:"Code,omitempty" json:"code,omitempty"` + Error string `yaml:"Error,omitempty" json:"error,omitempty"` + Model string `yaml:"Model,omitempty" json:"model,omitempty"` + CreatedAt time.Time `yaml:"CreatedAt,omitempty" json:"created_at,omitempty"` + Response string `yaml:"Response,omitempty" json:"response,omitempty"` + Done bool `yaml:"Done,omitempty" json:"done,omitempty"` + Context []int `yaml:"Context,omitempty" json:"context,omitempty"` + TotalDuration int64 `yaml:"TotalDuration,omitempty" json:"total_duration,omitempty"` + LoadDuration int `yaml:"LoadDuration,omitempty" json:"load_duration,omitempty"` + PromptEvalCount int `yaml:"PromptEvalCount,omitempty" json:"prompt_eval_count,omitempty"` + PromptEvalDuration int `yaml:"PromptEvalDuration,omitempty" json:"prompt_eval_duration,omitempty"` + EvalCount int `yaml:"EvalCount,omitempty" json:"eval_count,omitempty"` + EvalDuration int64 `yaml:"EvalDuration,omitempty" json:"eval_duration,omitempty"` + Result ResultPayload `yaml:"Result,omitempty" json:"result,omitempty"` +} + +// Err returns an error if the request has failed. +func (r *Response) Err() error { + if r == nil { + return errors.New("response is nil") + } + + if r.Code >= 400 { + if r.Error != "" { + return errors.New(r.Error) + } + + return fmt.Errorf("error %d", r.Code) + } else if len(r.Result.Labels) == 0 && r.Result.Caption == nil { + return errors.New("no result") + } + + return nil +} + +// HasResult checks if there is at least one result in the response data. +func (r *Response) HasResult() bool { + if r == nil { + return false + } + + return len(r.Result.Labels) > 0 || r.Result.Caption != nil +} + +// ResultPayload mirrors the structure returned by Ollama for result data. +type ResultPayload struct { + Labels []LabelPayload `json:"labels"` + Caption *CaptionPayload `json:"caption,omitempty"` +} + +// LabelPayload represents a single label object emitted by the Ollama adapter. 
+type LabelPayload struct {
+	Name           string   `json:"name"`
+	Source         string   `json:"source,omitempty"`
+	Priority       int      `json:"priority,omitempty"`
+	Confidence     float32  `json:"confidence,omitempty"`
+	Topicality     float32  `json:"topicality,omitempty"`
+	Categories     []string `json:"categories,omitempty"`
+	NSFW           bool     `json:"nsfw,omitempty"`
+	NSFWConfidence float32  `json:"nsfw_confidence,omitempty"`
+}
+
+// CaptionPayload represents the caption object emitted by the Ollama adapter.
+type CaptionPayload struct {
+	Text       string  `json:"text"`
+	Source     string  `json:"source,omitempty"`
+	Confidence float32 `json:"confidence,omitempty"`
+}
diff --git a/internal/ai/vision/openai/README.md b/internal/ai/vision/openai/README.md
new file mode 100644
index 000000000..f6d75bc38
--- /dev/null
+++ b/internal/ai/vision/openai/README.md
@@ -0,0 +1,128 @@
+## PhotoPrism — OpenAI API Integration
+
+**Last Updated:** November 14, 2025
+
+### Overview
+
+This package contains PhotoPrism’s adapter for the OpenAI Responses API. It enables existing caption and label workflows (`GenerateCaption`, `GenerateLabels`, and the `photoprism vision run` CLI) to call OpenAI models alongside TensorFlow and Ollama without changing worker or API code. The implementation focuses on predictable results, structured outputs, and clear observability so operators can opt in gradually.
+
+#### Context & Constraints
+
+- OpenAI requests flow through the existing vision client (`internal/ai/vision/api_client.go`) and must honour PhotoPrism’s timeout, logging, and ACL rules.
+- Structured outputs are preferred, but the adapter must gracefully handle free-form text; `output_text` responses are parsed both as JSON and as plain captions.
+- Costs should remain predictable: requests are limited to a single 720 px thumbnail (`detail=low`) with capped token budgets (512 caption, 1024 labels).
+- Secrets are supplied per model (`Service.Key`) with fallbacks to `OPENAI_API_KEY` / `_FILE`. Logs must redact sensitive data.
+
+#### Goals
+
+- Provide drop-in OpenAI support for captions and labels using `vision.yml`.
+- Keep configuration ergonomic by auto-populating prompts, schema names, token limits, and sampling defaults.
+- Expose enough logging and tests so operators can compare OpenAI output with existing engines before enabling it broadly.
+
+#### Non-Goals
+
+- Introducing a new `generate` model type or combined caption/label endpoint (reserved for a later phase).
+- Replacing the default TensorFlow models; they remain active as fallbacks.
+- Managing OpenAI billing or quota dashboards beyond surfacing token counts in logs and metrics.
+
+### Prompt, Model, & Schema Guidance
+
+- **Models:** The adapter targets GPT‑5 vision tiers (e.g. `gpt-5-nano`, `gpt-5-mini`). These models support image inputs, structured outputs, and deterministic settings. Set `Name` to the exact provider identifier so defaults are applied correctly. Caption models share the same configuration surface and run through the same adapter.
+- **Prompts:** Defaults live in `defaults.go`. Captions use a single-sentence instruction; labels use `LabelPromptDefault` (or `LabelPromptNSFW` when PhotoPrism requests NSFW metadata). Custom prompts should retain schema reminders so structured outputs stay valid.
+- **Schemas:** Labels use the JSON schema returned by `schema.LabelsJsonSchema(nsfw)`; the response format name is derived via `schema.JsonSchemaName` (e.g. `photoprism_vision_labels_v1`), as sketched below. Captions omit schemas unless operators explicitly request a structured format.
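+
+For orientation, the snippet below is a minimal in-repo sketch of how these helpers fit together, using only the helpers introduced in this changeset (`openai.SchemaLabels`, `schema.JsonSchemaName`, `openai.DefaultSchemaVersion`); the printed name is an example, not a guaranteed value:
+
+```go
+package main
+
+import (
+	"fmt"
+
+	"github.com/photoprism/photoprism/internal/ai/vision/openai"
+	"github.com/photoprism/photoprism/internal/ai/vision/schema"
+)
+
+func main() {
+	// Canonical labels schema (json.RawMessage) used for structured outputs.
+	labelsSchema := openai.SchemaLabels(false)
+
+	// Derive the response format name from the schema content and the
+	// default version suffix, e.g. "photoprism_vision_labels_v1".
+	name := schema.JsonSchemaName(labelsSchema, openai.DefaultSchemaVersion)
+
+	fmt.Println(name)
+}
+```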
+- **When to keep defaults:** For most deployments, leaving `System`, `Prompt`, `Schema`, and `Options` unset yields stable output with minimal configuration. Override them only when domain-specific language or custom scoring is necessary, and add regression tests alongside.
+
+Budget-conscious operators can experiment with lighter prompts or lower-resolution thumbnails, but should keep token limits and determinism settings intact to avoid unexpected bills and UI churn.
+
+#### Performance & Cost Estimates
+
+- **Token budgets:** Captions request up to 512 output tokens; labels request up to 1024. Input tokens are typically ≤700 for a single 720 px thumbnail plus prompts.
+- **Latency:** GPT‑5 nano/mini vision calls typically complete in 3–8 s, depending on OpenAI region. Including reasoning metadata (`reasoning.effort=low`) has negligible impact but improves traceability.
+- **Costs:** Consult OpenAI’s pricing for the selected model. Multiply input/output tokens by the published rate. PhotoPrism currently sends one image per request to keep costs linear with photo count.
+
+#### Defaults
+
+- File scheme: `data:` URLs (base64) for all OpenAI models.
+- Resolution: 720 px thumbnails (`vision.Thumb(ModelTypeCaption|Labels)`).
+- Options: `MaxOutputTokens` raised to 512 (caption) / 1024 (labels); `ForceJson=false` for captions, `true` for labels; `reasoning.effort="low"`.
+- Sampling: `Temperature` and `TopP` set to `0` for `gpt-5*` models; inherited values (0.1/0.9) remain for other engines. `openaiBuilder.Build` performs this override while preserving the struct defaults for non-OpenAI adapters.
+- Schema naming: Automatically derived via `schema.JsonSchemaName`, so operators may omit `SchemaVersion`.
+
+### Configuration
+
+#### Environment Variables
+
+- `OPENAI_API_KEY` / `OPENAI_API_KEY_FILE` — fallback credentials when a model’s `Service.Key` is unset.
+- Existing `PHOTOPRISM_VISION_*` variables remain authoritative (see the [Developer Guide](https://docs.photoprism.app/developer-guide/vision/service/) for full lists).
+
+#### `vision.yml` Examples
+
+```yaml
+Models:
+  - Type: caption
+    Name: gpt-5-nano
+    Engine: openai
+    Disabled: false # opt in manually
+    Resolution: 720 # optional; default is 720
+    Options:
+      Detail: low # optional; defaults to low
+      MaxOutputTokens: 512
+    Service:
+      Uri: https://api.openai.com/v1/responses
+      FileScheme: data
+      Key: ${OPENAI_API_KEY}
+
+  - Type: labels
+    Name: gpt-5-mini
+    Engine: openai
+    Disabled: false
+    Resolution: 720
+    Options:
+      Detail: low
+      MaxOutputTokens: 1024
+      ForceJson: true # redundant but explicit
+    Service:
+      Uri: https://api.openai.com/v1/responses
+      FileScheme: data
+      Key: ${OPENAI_API_KEY}
+```
+
+Keep TensorFlow entries in place so PhotoPrism falls back when the external service is unavailable.
+
+### Documentation
+
+- Label Generation:
+- Caption Generation:
+- Vision CLI Commands:
+
+### Implementation Details
+
+#### Core Concepts
+
+- **Structured outputs:** PhotoPrism leverages OpenAI’s structured output capability as documented at https://platform.openai.com/docs/guides/structured-outputs. When a JSON schema is supplied, the adapter emits `text.format` with `type: "json_schema"` and a schema name derived from the content. The parser then prefers `output_json`, but also attempts to decode `output_text` payloads that contain JSON objects (see the request sketch below).
+- **Deterministic sampling:** GPT‑5 models are run with `temperature=0` and `top_p=0` to minimise variance, while still allowing developers to override values in `vision.yml` if needed.
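+
+As a rough illustration of the two points above, this in-repo sketch assembles a Responses API payload with the transport types from this package and prints it; the model name, prompt, image data, and schema are placeholder values, and the zero-valued `Temperature`/`TopP` are dropped from the JSON via `omitempty`, mirroring the GPT‑5 defaults:
+
+```go
+package main
+
+import (
+	"encoding/json"
+	"fmt"
+
+	"github.com/photoprism/photoprism/internal/ai/vision/openai"
+)
+
+func main() {
+	req := openai.HTTPRequest{
+		Model: "gpt-5-mini",
+		Input: []openai.InputMessage{{
+			Role: "user",
+			Type: "message",
+			Content: []openai.ContentItem{
+				{Type: openai.ContentTypeText, Text: "Analyze the image."},
+				{Type: openai.ContentTypeImage, ImageURL: "data:image/jpeg;base64,AA==", Detail: openai.DefaultDetail},
+			},
+		}},
+		// Structured outputs: text.format with type "json_schema".
+		Text: &openai.TextOptions{Format: &openai.ResponseFormat{
+			Type:   openai.ResponseFormatJSONSchema,
+			Name:   "photoprism_vision_labels_v1",
+			Schema: json.RawMessage(`{"type":"object"}`),
+		}},
+		Reasoning:       &openai.Reasoning{Effort: "low"},
+		MaxOutputTokens: 1024,
+	}
+
+	payload, err := json.Marshal(req)
+	if err != nil {
+		panic(err)
+	}
+
+	fmt.Println(string(payload))
+}
+```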
+- **Reasoning metadata:** Requests include `reasoning.effort="low"` so OpenAI returns structured reasoning usage counters, helping operators track token consumption.
+- **Worker summaries:** The vision worker now logs either “updated …” or “processed … (no metadata changes detected)”, making reruns easy to audit.
+
+#### Rate Limiting
+
+OpenAI calls respect the existing `limiter.Auth` configuration used by the vision service. Failed requests surface standard HTTP errors and are not automatically retried; operators should ensure they have adequate account limits and consider external rate limiting when sharing credentials.
+
+#### Testing & Validation
+
+1. Unit tests: `go test ./internal/ai/vision/openai ./internal/ai/vision -run OpenAI -count=1`. Fixtures under `internal/ai/vision/openai/testdata/` replay real Responses payloads (captions and labels).
+2. CLI smoke test: `photoprism vision run -m labels --count 1 --force --model=gpt-5-mini` with trace logging enabled to inspect sanitised Responses.
+3. Compare worker summaries and label sources (`openai`) in the UI or via `photoprism vision ls`.
+
+#### Code Map
+
+- **Adapter & defaults:** `internal/ai/vision/openai` (defaults, schema helpers, transport, tests).
+- **Request/response plumbing:** `internal/ai/vision/api_request.go`, `api_client.go`, `engine_openai.go`, `engine_openai_test.go`.
+- **Workers & CLI:** `internal/workers/vision.go`, `internal/commands/vision_run.go`.
+- **Shared utilities:** `internal/ai/vision/schema`, `pkg/clean`, `pkg/media`.
+
+#### Next Steps
+
+- [ ] Introduce the future `generate` model type that combines captions, labels, and optional markers.
+- [ ] Evaluate additional OpenAI models as pricing and capabilities evolve.
+- [ ] Expose token usage metrics (input/output/reasoning) via Prometheus once the schema stabilises.
diff --git a/internal/ai/vision/openai/defaults.go b/internal/ai/vision/openai/defaults.go
index b29b44bea..36f9977dd 100644
--- a/internal/ai/vision/openai/defaults.go
+++ b/internal/ai/vision/openai/defaults.go
@@ -1,6 +1,29 @@
 package openai
 
-import "github.com/photoprism/photoprism/internal/ai/vision/schema"
+const (
+	// CaptionSystem defines the default system prompt for caption models.
+	CaptionSystem = "You are a PhotoPrism vision model. Return concise, user-friendly captions that describe the main subjects accurately."
+	// CaptionPrompt instructs caption models to respond with a single sentence.
+	CaptionPrompt = "Provide exactly one sentence describing the key subject and action in the image. Avoid filler words and technical jargon."
+	// LabelSystem defines the system prompt for label generation.
+	LabelSystem = "You are a PhotoPrism vision model. Emit JSON that matches the provided schema and keep label names short, singular nouns."
+	// LabelPromptDefault requests general-purpose labels.
+	LabelPromptDefault = "Analyze the image and return label objects with name, confidence (0-1), and topicality (0-1)."
+	// LabelPromptNSFW requests labels including NSFW metadata when required.
+	LabelPromptNSFW = "Analyze the image and return label objects with name, confidence (0-1), topicality (0-1), nsfw (true when sensitive), and nsfw_confidence (0-1)."
+	// DefaultDetail specifies the preferred thumbnail detail level for Responses API calls.
+	DefaultDetail = "low"
+	// CaptionMaxTokens suggests the output budget for caption responses.
+	CaptionMaxTokens = 512
+	// LabelsMaxTokens suggests the output budget for label responses.
+	LabelsMaxTokens = 1024
+	// DefaultTemperature configures deterministic replies.
+	DefaultTemperature = 0.1
+	// DefaultTopP limits nucleus sampling.
+	DefaultTopP = 0.9
+	// DefaultSchemaVersion is used when callers do not specify an explicit schema version.
+	DefaultSchemaVersion = "v1"
+)
 
 var (
 	// DefaultModel is the model used by default when accessing the OpenAI API.
@@ -8,8 +31,3 @@ var (
 	// DefaultModel is the model used by default when accessing the OpenAI API.
 	// DefaultResolution is the default thumbnail size submitted to the OpenAI.
 )
-
-// LabelsSchema returns the canonical label schema string consumed by OpenAI models.
-func LabelsSchema() string {
-	return schema.LabelsDefault
-}
diff --git a/internal/ai/vision/openai/schema.go b/internal/ai/vision/openai/schema.go
new file mode 100644
index 000000000..0d37aeb84
--- /dev/null
+++ b/internal/ai/vision/openai/schema.go
@@ -0,0 +1,16 @@
+package openai
+
+import (
+	"encoding/json"
+
+	"github.com/photoprism/photoprism/internal/ai/vision/schema"
+)
+
+// SchemaLabels returns the canonical labels JSON Schema consumed by OpenAI models.
+//
+// Related documentation and references:
+// - https://platform.openai.com/docs/guides/structured-outputs
+// - https://json-schema.org/learn/miscellaneous-examples
+func SchemaLabels(nsfw bool) json.RawMessage {
+	return schema.LabelsJsonSchema(nsfw)
+}
diff --git a/internal/ai/vision/openai/testdata/caption-response.json b/internal/ai/vision/openai/testdata/caption-response.json
new file mode 100644
index 000000000..e77eac59c
--- /dev/null
+++ b/internal/ai/vision/openai/testdata/caption-response.json
@@ -0,0 +1,73 @@
+{
+  "id": "resp_0d356718505119f3006916e5d8730881a0b91de2aa700f6196",
+  "object": "response",
+  "created_at": 1763108312,
+  "status": "completed",
+  "background": false,
+  "billing": {
+    "payer": "developer"
+  },
+  "error": null,
+  "incomplete_details": null,
+  "instructions": null,
+  "max_output_tokens": 512,
+  "max_tool_calls": null,
+  "model": "gpt-5-nano-2025-08-07",
+  "output": [
+    {
+      "id": "rs_0d356718505119f3006916e5d8efd481a0a4f9cc1823cc6c83",
+      "type": "reasoning",
+      "summary": []
+    },
+    {
+      "id": "msg_0d356718505119f3006916e5d9433881a0bc79197d2cfc2027",
+      "type": "message",
+      "status": "completed",
+      "content": [
+        {
+          "type": "output_text",
+          "annotations": [],
+          "logprobs": [],
+          "text": "A bee gathers nectar from the vibrant red poppy\u2019s center."
+ } + ], + "role": "assistant" + } + ], + "parallel_tool_calls": true, + "previous_response_id": null, + "prompt_cache_key": null, + "prompt_cache_retention": null, + "reasoning": { + "effort": "low", + "summary": null + }, + "safety_identifier": null, + "service_tier": "default", + "store": true, + "temperature": 1.0, + "text": { + "format": { + "type": "text" + }, + "verbosity": "medium" + }, + "tool_choice": "auto", + "tools": [], + "top_logprobs": 0, + "top_p": 1.0, + "truncation": "disabled", + "usage": { + "input_tokens": 576, + "input_tokens_details": { + "cached_tokens": 0 + }, + "output_tokens": 19, + "output_tokens_details": { + "reasoning_tokens": 0 + }, + "total_tokens": 595 + }, + "user": null, + "metadata": {} +} diff --git a/internal/ai/vision/openai/testdata/labels-response.json b/internal/ai/vision/openai/testdata/labels-response.json new file mode 100644 index 000000000..c1cc3deb1 --- /dev/null +++ b/internal/ai/vision/openai/testdata/labels-response.json @@ -0,0 +1,114 @@ +{ + "id": "resp_0fa91dfb69b7d644006916ea0b72ac819f84ff3152a38dfcdb", + "object": "response", + "created_at": 1763109387, + "status": "completed", + "background": false, + "billing": { + "payer": "developer" + }, + "error": null, + "incomplete_details": null, + "instructions": null, + "max_output_tokens": 1024, + "max_tool_calls": null, + "model": "gpt-5-mini-2025-08-07", + "output": [ + { + "id": "rs_0fa91dfb69b7d644006916ea0c3450819f8a13396bf377f474", + "type": "reasoning", + "summary": [] + }, + { + "id": "msg_0fa91dfb69b7d644006916ea0d2dfc819faf52b11334fc10a4", + "type": "message", + "status": "completed", + "content": [ + { + "type": "output_text", + "annotations": [], + "logprobs": [], + "text": "{\"labels\":[{\"name\":\"flower\",\"confidence\":0.99,\"topicality\":0.99},{\"name\":\"bee\",\"confidence\":0.95,\"topicality\":0.95},{\"name\":\"petal\",\"confidence\":0.92,\"topicality\":0.88},{\"name\":\"pollen\",\"confidence\":0.85,\"topicality\":0.8},{\"name\":\"insect\",\"confidence\":0.9,\"topicality\":0.85},{\"name\":\"red\",\"confidence\":0.88,\"topicality\":0.6},{\"name\":\"close-up\",\"confidence\":0.86,\"topicality\":0.7},{\"name\":\"nature\",\"confidence\":0.8,\"topicality\":0.5}]}" + } + ], + "role": "assistant" + } + ], + "parallel_tool_calls": true, + "previous_response_id": null, + "prompt_cache_key": null, + "prompt_cache_retention": null, + "reasoning": { + "effort": "low", + "summary": null + }, + "safety_identifier": null, + "service_tier": "default", + "store": true, + "temperature": 1.0, + "text": { + "format": { + "type": "json_schema", + "description": null, + "name": "photoprism_vision_labels_v1", + "schema": { + "type": "object", + "properties": { + "labels": { + "type": "array", + "items": { + "type": "object", + "properties": { + "name": { + "type": "string", + "minLength": 1 + }, + "confidence": { + "type": "number", + "minimum": 0, + "maximum": 1 + }, + "topicality": { + "type": "number", + "minimum": 0, + "maximum": 1 + } + }, + "required": [ + "name", + "confidence", + "topicality" + ], + "additionalProperties": false + }, + "default": [] + } + }, + "required": [ + "labels" + ], + "additionalProperties": false + }, + "strict": true + }, + "verbosity": "medium" + }, + "tool_choice": "auto", + "tools": [], + "top_logprobs": 0, + "top_p": 1.0, + "truncation": "disabled", + "usage": { + "input_tokens": 724, + "input_tokens_details": { + "cached_tokens": 0 + }, + "output_tokens": 169, + "output_tokens_details": { + "reasoning_tokens": 0 + }, + "total_tokens": 893 + }, + "user": 
null, + "metadata": {} +} diff --git a/internal/ai/vision/openai/transport.go b/internal/ai/vision/openai/transport.go new file mode 100644 index 000000000..ee061e149 --- /dev/null +++ b/internal/ai/vision/openai/transport.go @@ -0,0 +1,142 @@ +package openai + +import ( + "encoding/json" + "strings" +) + +const ( + // ContentTypeText identifies text input segments for the Responses API. + ContentTypeText = "input_text" + // ContentTypeImage identifies image input segments for the Responses API. + ContentTypeImage = "input_image" + + // ResponseFormatJSONSchema requests JSON constrained by a schema. + ResponseFormatJSONSchema = "json_schema" + // ResponseFormatJSONObject requests a free-form JSON object. + ResponseFormatJSONObject = "json_object" +) + +// HTTPRequest represents the payload expected by OpenAI's Responses API. +type HTTPRequest struct { + Model string `json:"model"` + Input []InputMessage `json:"input"` + Text *TextOptions `json:"text,omitempty"` + Reasoning *Reasoning `json:"reasoning,omitempty"` + MaxOutputTokens int `json:"max_output_tokens,omitempty"` + Temperature float64 `json:"temperature,omitempty"` + TopP float64 `json:"top_p,omitempty"` + PresencePenalty float64 `json:"presence_penalty,omitempty"` + FrequencyPenalty float64 `json:"frequency_penalty,omitempty"` +} + +// TextOptions carries formatting preferences for textual responses. +type TextOptions struct { + Format *ResponseFormat `json:"format,omitempty"` +} + +// Reasoning configures the effort level for reasoning models. +type Reasoning struct { + Effort string `json:"effort,omitempty"` +} + +// InputMessage captures a single system or user message in the request. +type InputMessage struct { + Role string `json:"role"` + Type string `json:"type,omitempty"` + Content []ContentItem `json:"content"` +} + +// ContentItem represents a text or image entry within a message. +type ContentItem struct { + Type string `json:"type"` + Text string `json:"text,omitempty"` + ImageURL string `json:"image_url,omitempty"` + Detail string `json:"detail,omitempty"` +} + +// ResponseFormat describes how OpenAI should format its response. +type ResponseFormat struct { + Type string `json:"type"` + Name string `json:"name,omitempty"` + Schema json.RawMessage `json:"schema,omitempty"` + Description string `json:"description,omitempty"` + Strict bool `json:"strict,omitempty"` +} + +// Response mirrors the subset of the Responses API response we need. +type Response struct { + ID string `json:"id"` + Model string `json:"model"` + Output []ResponseOutput `json:"output"` + Error *struct { + Message string `json:"message"` + Type string `json:"type"` + } `json:"error,omitempty"` +} + +// ResponseOutput captures assistant messages within the response. +type ResponseOutput struct { + Role string `json:"role"` + Content []ResponseContent `json:"content"` +} + +// ResponseContent contains individual message parts (JSON or text). +type ResponseContent struct { + Type string `json:"type"` + Text string `json:"text,omitempty"` + JSON json.RawMessage `json:"json,omitempty"` +} + +// FirstJSON returns the first JSON payload contained in the response. +func (r *Response) FirstJSON() json.RawMessage { + if r == nil { + return nil + } + + for i := range r.Output { + for j := range r.Output[i].Content { + if len(r.Output[i].Content[j].JSON) > 0 { + return r.Output[i].Content[j].JSON + } + } + } + + return nil +} + +// FirstText returns the first textual payload contained in the response. 
+func (r *Response) FirstText() string { + if r == nil { + return "" + } + + for i := range r.Output { + for j := range r.Output[i].Content { + if text := strings.TrimSpace(r.Output[i].Content[j].Text); text != "" { + return text + } + } + } + + return "" +} + +// ParseErrorMessage extracts a human readable error message from a Responses API payload. +func ParseErrorMessage(raw []byte) string { + var errResp struct { + Error *struct { + Message string `json:"message"` + } `json:"error"` + } + + if err := json.Unmarshal(raw, &errResp); err != nil { + return "" + } + + if errResp.Error != nil { + return strings.TrimSpace(errResp.Error.Message) + } + + return "" +} diff --git a/internal/ai/vision/openai/transport_test.go b/internal/ai/vision/openai/transport_test.go new file mode 100644 index 000000000..6141ea4f6 --- /dev/null +++ b/internal/ai/vision/openai/transport_test.go @@ -0,0 +1,120 @@ +package openai + +import ( + "encoding/json" + "os" + "path/filepath" + "testing" +) + +func loadTestResponse(t *testing.T, name string) *Response { + t.Helper() + + filePath := filepath.Join("testdata", name) + + data, err := os.ReadFile(filePath) + if err != nil { + t.Fatalf("failed to read %s: %v", filePath, err) + } + + var resp Response + if err := json.Unmarshal(data, &resp); err != nil { + t.Fatalf("failed to unmarshal %s: %v", filePath, err) + } + + return &resp +} + +func TestParseErrorMessage(t *testing.T) { + t.Run("returns message when present", func(t *testing.T) { + raw := []byte(`{"error":{"message":"Invalid schema"}}`) + msg := ParseErrorMessage(raw) + if msg != "Invalid schema" { + t.Fatalf("expected message, got %q", msg) + } + }) + + t.Run("returns empty string when error is missing", func(t *testing.T) { + raw := []byte(`{"output":[]}`) + if msg := ParseErrorMessage(raw); msg != "" { + t.Fatalf("expected empty message, got %q", msg) + } + }) +} + +func TestResponseFirstTextCaption(t *testing.T) { + resp := loadTestResponse(t, "caption-response.json") + + if jsonPayload := resp.FirstJSON(); len(jsonPayload) != 0 { + t.Fatalf("expected no JSON payload, got: %s", jsonPayload) + } + + text := resp.FirstText() + expected := "A bee gathers nectar from the vibrant red poppy’s center." 
+ if text != expected { + t.Fatalf("unexpected caption text: %q", text) + } +} + +func TestResponseFirstTextLabels(t *testing.T) { + resp := loadTestResponse(t, "labels-response.json") + + if jsonPayload := resp.FirstJSON(); len(jsonPayload) != 0 { + t.Fatalf("expected no JSON payload, got: %s", jsonPayload) + } + + text := resp.FirstText() + if len(text) == 0 { + t.Fatal("expected structured JSON string in text payload") + } + if text[0] != '{' { + t.Fatalf("expected JSON object in text payload, got %q", text) + } +} + +func TestResponseFirstJSONFromStructuredPayload(t *testing.T) { + resp := &Response{ + ID: "resp_structured", + Model: "gpt-5-mini", + Output: []ResponseOutput{ + { + Role: "assistant", + Content: []ResponseContent{ + { + Type: "output_json", + JSON: json.RawMessage(`{"labels":[{"name":"sunset"}]}`), + }, + }, + }, + }, + } + + jsonPayload := resp.FirstJSON() + if len(jsonPayload) == 0 { + t.Fatal("expected JSON payload, got empty result") + } + + var decoded struct { + Labels []map[string]string `json:"labels"` + } + if err := json.Unmarshal(jsonPayload, &decoded); err != nil { + t.Fatalf("failed to decode JSON payload: %v", err) + } + + if len(decoded.Labels) != 1 || decoded.Labels[0]["name"] != "sunset" { + t.Fatalf("unexpected JSON payload: %+v", decoded.Labels) + } +} + +func TestSchemaLabelsReturnsValidJSON(t *testing.T) { + raw := SchemaLabels(false) + + var decoded map[string]any + if err := json.Unmarshal(raw, &decoded); err != nil { + t.Fatalf("schema should be valid JSON: %v", err) + } + + if decoded["type"] != "object" { + t.Fatalf("expected type object, got %v", decoded["type"]) + } +} diff --git a/internal/ai/vision/schema/labels.go b/internal/ai/vision/schema/labels.go index 735a70cb9..6ecd26afa 100644 --- a/internal/ai/vision/schema/labels.go +++ b/internal/ai/vision/schema/labels.go @@ -1,16 +1,115 @@ package schema -// LabelsDefault provides the minimal JSON schema for label responses used across engines. -const ( - LabelsDefault = "{\n \"labels\": [{\n \"name\": \"\",\n \"confidence\": 0,\n \"topicality\": 0 }]\n}" - LabelsNSFW = "{\n \"labels\": [{\n \"name\": \"\",\n \"confidence\": 0,\n \"topicality\": 0,\n \"nsfw\": false,\n \"nsfw_confidence\": 0\n }]\n}" +import ( + "encoding/json" ) -// Labels returns the canonical label schema string. -func Labels(nsfw bool) string { +// LabelsJsonSchemaDefault provides the minimal JSON schema for label responses used across engines. 
+const ( + LabelsJsonSchemaDefault = `{ + "type": "object", + "properties": { + "labels": { + "type": "array", + "items": { + "type": "object", + "properties": { + "name": { + "type": "string", + "minLength": 1 + }, + "confidence": { + "type": "number", + "minimum": 0, + "maximum": 1 + }, + "topicality": { + "type": "number", + "minimum": 0, + "maximum": 1 + } + }, + "required": ["name", "confidence", "topicality"], + "additionalProperties": false + }, + "default": [] + } + }, + "required": ["labels"], + "additionalProperties": false +}` + LabelsJsonDefault = "{\n \"labels\": [{\n \"name\": \"\",\n \"confidence\": 0,\n \"topicality\": 0 }]\n}" + LabelsJsonSchemaNSFW = `{ + "type": "object", + "properties": { + "labels": { + "type": "array", + "items": { + "type": "object", + "properties": { + "name": { + "type": "string", + "minLength": 1 + }, + "confidence": { + "type": "number", + "minimum": 0, + "maximum": 1 + }, + "topicality": { + "type": "number", + "minimum": 0, + "maximum": 1 + }, + "nsfw": { + "type": "boolean" + }, + "nsfw_confidence": { + "type": "number", + "minimum": 0, + "maximum": 1 + } + }, + "required": [ + "name", + "confidence", + "topicality", + "nsfw", + "nsfw_confidence" + ], + "additionalProperties": false + }, + "default": [] + } + }, + "required": ["labels"], + "additionalProperties": false +}` + LabelsJsonNSFW = "{\n \"labels\": [{\n \"name\": \"\",\n \"confidence\": 0,\n \"topicality\": 0,\n \"nsfw\": false,\n \"nsfw_confidence\": 0\n }]\n}" +) + +// LabelsJsonSchema returns the canonical label JSON Schema string for OpenAI API endpoints. +// +// Related documentation and references: +// - https://platform.openai.com/docs/guides/structured-outputs +// - https://json-schema.org/learn/miscellaneous-examples +func LabelsJsonSchema(nsfw bool) json.RawMessage { if nsfw { - return LabelsNSFW + return json.RawMessage(LabelsJsonSchemaNSFW) } else { - return LabelsDefault + return json.RawMessage(LabelsJsonSchemaDefault) + } +} + +// LabelsJson returns the canonical label JSON string for Ollama vision models. +// +// Related documentation and references: +// - https://www.alibabacloud.com/help/en/model-studio/json-mode +// - https://www.json.org/json-en.html +func LabelsJson(nsfw bool) string { + if nsfw { + return LabelsJsonNSFW + } else { + return LabelsJsonDefault } } diff --git a/internal/ai/vision/schema/name.go b/internal/ai/vision/schema/name.go new file mode 100644 index 000000000..2f642f2f3 --- /dev/null +++ b/internal/ai/vision/schema/name.go @@ -0,0 +1,36 @@ +package schema + +import ( + "bytes" + "encoding/json" + "fmt" + + "github.com/photoprism/photoprism/pkg/clean" +) + +const ( + NamePrefix = "photoprism_vision" +) + +// JsonSchemaName returns the schema version string to be used for API requests. 
+func JsonSchemaName(schema json.RawMessage, version string) string {
+	var schemaName string
+
+	switch {
+	case bytes.Contains(schema, []byte("labels")):
+		schemaName = "labels"
+	case bytes.Contains(schema, []byte("caption")):
+		schemaName = "caption"
+	default:
+		schemaName = "schema"
+	}
+
+	version = clean.TypeLowerUnderscore(version)
+
+	if version == "" {
+		version = "v1"
+	}
+
+	return fmt.Sprintf("%s_%s_%s", NamePrefix, schemaName, version)
+
+}
diff --git a/internal/ai/vision/schema/name_test.go b/internal/ai/vision/schema/name_test.go
new file mode 100644
index 000000000..c2d0897f7
--- /dev/null
+++ b/internal/ai/vision/schema/name_test.go
@@ -0,0 +1,23 @@
+package schema
+
+import (
+	"encoding/json"
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+)
+
+func TestJsonSchemaName(t *testing.T) {
+	t.Run("Default", func(t *testing.T) {
+		assert.Equal(t, "photoprism_vision_schema_v1", JsonSchemaName(nil, ""))
+	})
+	t.Run("Labels", func(t *testing.T) {
+		assert.Equal(t, "photoprism_vision_labels_v1", JsonSchemaName(json.RawMessage(LabelsJsonSchemaDefault), ""))
+	})
+	t.Run("LabelsV2", func(t *testing.T) {
+		assert.Equal(t, "photoprism_vision_labels_v2", JsonSchemaName([]byte("labels"), "v2"))
+	})
+	t.Run("LabelsJsonSchema", func(t *testing.T) {
+		assert.Equal(t, "photoprism_vision_labels_v1", JsonSchemaName(LabelsJsonSchema(false), "v1"))
+	})
+}
diff --git a/internal/ai/vision/schema/schema.go b/internal/ai/vision/schema/schema.go
index 87a1dc6fc..4801a8477 100644
--- a/internal/ai/vision/schema/schema.go
+++ b/internal/ai/vision/schema/schema.go
@@ -1,5 +1,5 @@
 /*
-Package schema defines canonical JSON schema templates shared by PhotoPrism's AI vision engines.
+Package schema defines canonical JSON and JSON Schema templates shared by PhotoPrism's AI vision engines.
 
 Copyright (c) 2018 - 2025 PhotoPrism UG. All rights reserved.
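Reviewer note: below is a minimal, hypothetical sketch (not part of this changeset) of how the schema helpers and `openai` transport types introduced above could fit together, assuming the actual wiring lands in api_request.go. The package name, model string, prompt text, and helper function names are placeholders; the struct fields, constants, and `schema` helpers are taken verbatim from the diffs above, and the resulting `text.format` block matches the captured labels-response.json fixture (`photoprism_vision_labels_v1`, `strict: true`, `max_output_tokens: 1024`).

```go
// Package sketch is illustrative only; it is not part of this PR.
package sketch

import (
	"encoding/json"
	"fmt"

	"github.com/photoprism/photoprism/internal/ai/vision/openai"
	"github.com/photoprism/photoprism/internal/ai/vision/schema"
)

// buildLabelsRequest assembles a Responses API payload that constrains the
// model output to the strict labels schema. Model name, prompt, and detail
// level are placeholder assumptions, not values taken from this PR.
func buildLabelsRequest(imageURL string) *openai.HTTPRequest {
	labelsSchema := schema.LabelsJsonSchema(false)

	return &openai.HTTPRequest{
		Model: "gpt-5-mini", // placeholder
		Input: []openai.InputMessage{{
			Role: "user",
			Content: []openai.ContentItem{
				{Type: openai.ContentTypeText, Text: "Return labels for this image."},
				{Type: openai.ContentTypeImage, ImageURL: imageURL, Detail: "low"},
			},
		}},
		Text: &openai.TextOptions{
			Format: &openai.ResponseFormat{
				Type:   openai.ResponseFormatJSONSchema,
				Name:   schema.JsonSchemaName(labelsSchema, "v1"), // "photoprism_vision_labels_v1"
				Schema: labelsSchema,
				Strict: true,
			},
		},
		MaxOutputTokens: 1024,
	}
}

// labelsFromResponse prefers a structured JSON part and falls back to the
// text payload, matching the extraction order used by the transport tests.
func labelsFromResponse(resp *openai.Response) (json.RawMessage, error) {
	if payload := resp.FirstJSON(); len(payload) > 0 {
		return payload, nil
	}

	if text := resp.FirstText(); text != "" {
		return json.RawMessage(text), nil
	}

	return nil, fmt.Errorf("no usable payload in response %s", resp.ID)
}
```

The text fallback is the path the fixtures actually exercise: in labels-response.json the schema-constrained JSON arrives in the `output_text` part, so `FirstJSON` returns nothing and `FirstText` yields the JSON string, exactly as `TestResponseFirstTextLabels` asserts.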
diff --git a/internal/config/feat/vision.go b/internal/config/feat/vision.go
index ae415a829..16d95045b 100644
--- a/internal/config/feat/vision.go
+++ b/internal/config/feat/vision.go
@@ -4,5 +4,5 @@ package feat
 var (
 	VisionModelGenerate = false // controls exposure of the generate endpoint and CLI commands
 	VisionModelMarkers  = false // gates marker generation/return until downstream UI and reconciliation paths are ready
-	VisionServiceOpenAI = false // controls whether users are able to configure OpenAI as a vision service engine
+	VisionServiceOpenAI = true // controls whether users are able to configure OpenAI as a vision service engine
 )
diff --git a/internal/workers/vision.go b/internal/workers/vision.go
index 4d85b6dbd..597432d9a 100644
--- a/internal/workers/vision.go
+++ b/internal/workers/vision.go
@@ -135,6 +135,7 @@ func (w *Vision) Start(filter string, count int, models []string, customSrc stri
 	done := make(map[string]bool)
 	offset := 0
 	updated := 0
+	processed := 0
 
 	// Make sure count is within range.
 	if count < 1 || count > search.MaxResults {
@@ -197,6 +198,8 @@ func (w *Vision) Start(filter string, count int, models []string, customSrc stri
 			continue
 		}
 
+		processed++
+
 		fileName := photoprism.FileName(photo.FileRoot, photo.FileName)
 
 		file, fileErr := photoprism.NewMediaFile(fileName)
@@ -279,7 +282,18 @@ func (w *Vision) Start(filter string, count int, models []string, customSrc stri
 		}
 	}
 
-	log.Infof("vision: updated %s [%s]", english.Plural(updated, "picture", "pictures"), time.Since(start))
+	elapsed := time.Since(start)
+
+	switch {
+	case processed == 0:
+		log.Infof("vision: no pictures required processing [%s]", elapsed)
+	case updated == processed:
+		log.Infof("vision: updated %s [%s]", english.Plural(updated, "picture", "pictures"), elapsed)
+	case updated == 0:
+		log.Infof("vision: processed %s (no metadata changes detected) [%s]", english.Plural(processed, "picture", "pictures"), elapsed)
+	default:
+		log.Infof("vision: updated %s out of %s [%s]", english.Plural(updated, "picture", "pictures"), english.Plural(processed, "picture", "pictures"), elapsed)
+	}
 
 	if updated > 0 {
 		updateIndex = true