Merge branch 'develop' into feature/batch-edit

This commit is contained in:
Michael Mayer 2025-11-15 15:42:59 +01:00
commit d2541e674a
59 changed files with 2571 additions and 216 deletions

View file

@ -1,6 +1,6 @@
# PhotoPrism® Repository Guidelines
**Last Updated:** November 12, 2025
**Last Updated:** November 14, 2025
## Purpose
@ -17,6 +17,7 @@ Learn more: https://agents.md/
- REST API: https://docs.photoprism.dev/ (Swagger), https://docs.photoprism.app/developer-guide/api/ (Docs)
- Code Maps: [`CODEMAP.md`](CODEMAP.md) (Backend/Go), [`frontend/CODEMAP.md`](frontend/CODEMAP.md) (Frontend/JS)
- Face Detection & Embeddings Notes: [`internal/ai/face/README.md`](internal/ai/face/README.md)
- Vision Engine Guides: [`internal/ai/vision/openai/README.md`](internal/ai/vision/openai/README.md), [`internal/ai/vision/ollama/README.md`](internal/ai/vision/ollama/README.md)
> Quick Tip: to inspect GitHub issue details without leaving the terminal, run `curl -s https://api.github.com/repos/photoprism/photoprism/issues/<id>`.

View file

@ -1,6 +1,6 @@
PhotoPrism — Backend CODEMAP
**Last Updated:** November 2, 2025
**Last Updated:** November 14, 2025
Purpose
- Give agents and contributors a fast, reliable map of where things live and how they fit together, so you can add features, fix bugs, and write tests without spelunking.
@ -35,6 +35,7 @@ High-Level Package Map (Go)
- `internal/config` — configuration, flags/env/options, client config, DB init/migrate
- `internal/entity` — GORM v1 models, queries, search helpers, migrations
- `internal/photoprism` — core domain logic (indexing, import, faces, thumbnails, cleanup)
- `internal/ai/vision` — multi-engine computer vision pipeline (models, adapters, schema). Adapter docs: [`internal/ai/vision/openai/README.md`](internal/ai/vision/openai/README.md) and [`internal/ai/vision/ollama/README.md`](internal/ai/vision/ollama/README.md).
- `internal/workers` — background schedulers (index, vision, sync, meta, backup)
- `internal/auth` — ACL, sessions, OIDC
- `internal/service` — cluster/portal, maps, hub, webdav

View file

@ -1,5 +1,5 @@
# Ubuntu 25.10 (Questing Quokka)
FROM photoprism/develop:251018-questing
FROM photoprism/develop:251113-questing
# Harden npm usage by default (applies to npm ci / install in dev container)
ENV NPM_CONFIG_IGNORE_SCRIPTS=true

View file

@ -388,7 +388,8 @@ services:
## Login with "user / photoprism" and "admin / photoprism".
keycloak:
image: quay.io/keycloak/keycloak:25.0
stop_grace_period: 30s
stop_grace_period: 20s
profiles: [ "all", "auth", "keycloak" ]
command: "start-dev" # development mode, do not use this in production!
links:
- "traefik:localssl.dev"

View file

@ -2,7 +2,7 @@ msgid ""
msgstr ""
"Project-Id-Version: \n"
"Report-Msgid-Bugs-To: ci@photoprism.app\n"
"PO-Revision-Date: 2025-11-11 22:02+0000\n"
"PO-Revision-Date: 2025-11-14 22:02+0000\n"
"Last-Translator: dtsolakis <dtsola@eranet.gr>\n"
"Language: el\n"
"Content-Type: text/plain; charset=UTF-8\n"
@ -88,7 +88,7 @@ msgstr "12 ώρες"
#: src/component/user/edit/dialog.vue:304 src/page/settings/account.vue:168
#: src/component/settings/passcode.vue:25
msgid "2-Factor Authentication"
msgstr "Αυθεντικοποίηση 2 παραγόντων"
msgstr "Αυθεντικοποίηση 2 Παραγόντων"
#: src/component/user/edit/dialog.vue:303 src/options/auth.js:47
msgid "2FA"
@ -246,7 +246,7 @@ msgstr "Προστέθηκε"
#: src/component/location/dialog.vue:21 src/component/location/dialog.vue:26
#: src/component/photo/edit/details.vue:158
msgid "Adjust Location"
msgstr "Ρύθμιση τοποθεσίας"
msgstr "Ρύθμιση Τοποθεσίας"
#: src/options/admin.js:6 src/common/util.js:798 src/options/auth.js:6
msgid "Admin"
@ -254,7 +254,7 @@ msgstr "Διαχειριστής"
#: src/page/settings.vue:86
msgid "Advanced"
msgstr "Σύνθετο"
msgstr "Προηγμένες Ρυθμίσεις"
#: src/options/options.js:378
msgid "After 1 day"
@ -298,7 +298,7 @@ msgstr "Άλμπουμ"
#: src/page/settings/advanced.vue:193
msgid "Album Backups"
msgstr "Αντίγραφα ασφαλείας άλμπουμ"
msgstr "Αντίγραφα Ασφαλείας Άλμπουμ"
#: src/page/albums.vue:1265
msgid "Album created"
@ -374,7 +374,7 @@ msgstr "Όλα τα πρωτότυπα"
#: src/component/photo/toolbar.vue:381 src/page/albums.vue:455
msgid "All Years"
msgstr "Όλα τα έτη"
msgstr "Όλα τα Χρόνια"
#: src/component/share/dialog.vue:134
msgid "Alternatively, you can upload files directly to WebDAV servers like Nextcloud."
@ -440,7 +440,7 @@ msgstr "Έγκριση και αποθήκευση αλλαγών"
#: src/page/settings/account.vue:183 src/component/settings/apps.vue:25
msgid "Apps and Devices"
msgstr "Εφαρμογές και συσκευές"
msgstr "Εφαρμογές και Συσκευές"
#: src/component/lightbox.vue:2315 src/component/photo/edit/info.vue:238
#: src/component/photo/edit/info.vue:239
@ -558,7 +558,7 @@ msgstr "Βιογραφικό"
#: src/page/settings/account.vue:212
msgid "Birth Date"
msgstr "Ημερομηνία γέννησης"
msgstr "Ημερομηνία Γέννησης"
#: src/options/options.js:403
msgid "Black"
@ -697,7 +697,7 @@ msgstr "Αλλαγή Άβαταρ"
#: src/page/settings/account.vue:155 src/component/settings/password.vue:16
msgid "Change Password"
msgstr "Νέος κωδικός πρόσβασης"
msgstr "Αλλαγή Κωδικού Πρόσβασης"
#: src/page/settings/general.vue:328
msgid "Change personal profile and security settings."
@ -733,7 +733,7 @@ msgstr "Οι αλλαγές αποθηκεύτηκαν επιτυχώς"
#: src/page/settings/advanced.vue:16
msgid "Changes to the advanced settings require a restart to take effect."
msgstr "Οι αλλαγές στις ρυθμίσεις για προχωρημένους απαιτούν επανεκκίνηση για να τεθούν σε ισχύ."
msgstr "Οι αλλαγές στις προηγμένες ρυθμίσεις απαιτούν επανεκκίνηση για να τεθούν σε ισχύ."
#: src/component/photo/edit/info.vue:230 src/component/photo/edit/info.vue:231
msgid "Checked"
@ -988,7 +988,7 @@ msgstr "Βάση δεδομένων"
#: src/page/settings/advanced.vue:177
msgid "Database Backups"
msgstr "Αντίγραφα ασφαλείας βάσης δεδομένων"
msgstr "Αντίγραφα Ασφαλείας Βάσης Δεδομένων"
#: src/locales.js:328
msgid "Databases"
@ -1066,7 +1066,7 @@ msgstr "Διαστάσεις"
#: src/page/settings/advanced.vue:84
msgid "Disable Backups"
msgstr "Απενεργοποίηση αντιγράφων ασφαλείας"
msgstr "Απενεργοποίηση Αντιγράφων Ασφαλείας"
#: src/page/settings/advanced.vue:366
msgid "Disable Darktable"
@ -1094,7 +1094,7 @@ msgstr "Απενεργοποίηση των διαδραστικών παγκό
#: src/page/settings/advanced.vue:116
msgid "Disable Places"
msgstr "Απενεργοποίηση Places"
msgstr "Απενεργοποίηση Τοποθεσιών"
#: src/page/settings/advanced.vue:382
msgid "Disable RawTherapee"
@ -1106,7 +1106,7 @@ msgstr "Απενεργοποίηση TensorFlow"
#: src/page/settings/advanced.vue:446
msgid "Disable Vectors"
msgstr "Απενεργοποίηση διανυσμάτων"
msgstr "Απενεργοποίηση Διανυσμάτων"
#: src/page/settings/advanced.vue:100
msgid "Disable WebDAV"
@ -1142,7 +1142,7 @@ msgstr "Απόρριψη"
#: src/page/admin/users.vue:267 src/page/settings/account.vue:76
#: src/page/settings/account.vue:78 src/locales.js:321
msgid "Display Name"
msgstr "Εμφανιζόμενο όνομα"
msgstr "Εμφανιζόμενο Όνομα"
#: src/page/settings/content.vue:170
msgid "Display picture captions in search results."
@ -1247,11 +1247,11 @@ msgstr "Διάρκεια"
#: src/page/settings/advanced.vue:285
msgid "Dynamic Previews"
msgstr "Δυναμικές προεπισκοπήσεις"
msgstr "Δυναμικές Προεπισκοπήσεις"
#: src/page/settings/advanced.vue:261
msgid "Dynamic Size Limit: %{n}px"
msgstr "Όριο δυναμικού μεγέθους: %{n}px"
msgstr "Όριο Δυναμικού Μεγέθους: %{n}px"
#: src/page/about/feedback.vue:80 src/page/about/feedback.vue:79
msgid "E-Mail"
@ -1369,7 +1369,7 @@ msgstr "Η εξαγωγή μεταδεδομένων με το ExifTool απαι
#: src/page/settings/advanced.vue:52
msgid "Experimental Features"
msgstr "Πειραματικά Χαρακτηριστικά"
msgstr "Πειραματικές Λειτουργίες"
#: src/page/admin/sessions.vue:203 src/page/admin/sessions.vue:296
#: src/component/service/edit.vue:69 src/component/settings/apps.vue:160
@ -1416,7 +1416,7 @@ msgstr "Επίθετο"
#: src/options/options.js:222
msgid "Fast"
msgstr "Γρήγορα"
msgstr "Γρήγορο"
#: src/component/album/edit/dialog.vue:91
#: src/component/label/edit/dialog.vue:44
@ -1457,7 +1457,7 @@ msgstr "Πρόγραμμα περιήγησης αρχείων"
#: src/page/settings/advanced.vue:354
msgid "File Conversion"
msgstr "Μετατροπή αρχείου"
msgstr "Μετατροπή Αρχείων"
#: src/component/album/edit/dialog.vue:147 src/component/photo/toolbar.vue:424
#: src/component/photo/toolbar.vue:435 src/component/photo/toolbar.vue:446
@ -1537,7 +1537,7 @@ msgstr "Πλήρης πρόσβαση"
#: src/component/lightbox.vue:1264 src/component/lightbox.vue:1265
msgid "Fullscreen"
msgstr "Πλήρης οθόνη"
msgstr "Πλήρης Οθόνη"
#: src/page/settings.vue:60
msgid "General"
@ -1655,7 +1655,7 @@ msgstr "Εικόνα"
#: src/page/settings/advanced.vue:301
msgid "Image Quality"
msgstr "Ποιότητα εικόνας"
msgstr "Ποιότητα Εικόνας"
#: src/page/library.vue:74 src/page/library/import.vue:44
#: src/page/library/import.vue:45 src/page/library/import.vue:73
@ -1809,7 +1809,7 @@ msgstr "Ποιότητα JPEG: %{n}"
#: src/page/settings/advanced.vue:323
msgid "JPEG Size Limit: %{n}px"
msgstr "Όριο μεγέθους JPEG: %{n}px"
msgstr "Όριο Μεγέθους JPEG: %{n}px"
#: src/page/library/import.vue:58
msgid "JPEGs and thumbnails are automatically rendered as needed."
@ -1856,7 +1856,7 @@ msgstr "Τελευταία φορά ενεργός"
#: src/page/admin/users.vue:276 src/locales.js:335
msgid "Last Login"
msgstr "Τελευταία σύνδεση"
msgstr "Τελευταία Σύνδεση"
#: src/locales.js:235 src/locales.js:293
msgid "Last page"
@ -1938,7 +1938,7 @@ msgstr "Λίστα"
#: src/page/settings/content.vue:141
msgid "List View"
msgstr "Προβολή λίστας"
msgstr "Προβολή σε Λίστα"
#: src/component/photo/view/cards.vue:139
#: src/component/photo/view/cards.vue:280 src/component/photo/view/list.vue:94
@ -1951,7 +1951,7 @@ msgstr "Ζωντανό"
#: src/component/navigation.vue:222 src/component/navigation.vue:237
#: src/component/navigation.vue:331
msgid "Live Photos"
msgstr "Φωτογραφίες"
msgstr "Ζωντανές Εικόνες"
#: src/locales.js:307
msgid "Load more"
@ -2188,7 +2188,7 @@ msgstr "Νέος κωδικός πρόσβασης"
#: src/component/photo/toolbar.vue:431 src/component/photo/toolbar.vue:441
#: src/page/albums.vue:462
msgid "Newest First"
msgstr "Το νεότερο πρώτα"
msgstr "Πρώτα τα πιο Πρόσφατα"
#: src/component/lightbox.vue:412 src/locales.js:297
msgid "Next"
@ -2285,7 +2285,7 @@ msgstr "Οι μη φωτογραφικές εικόνες και οι εικόν
#: src/options/admin.js:51 src/options/auth.js:33 src/options/options.js:218
#: src/options/options.js:334
msgid "None"
msgstr "Κανένα"
msgstr "Καθόλου"
#: src/component/lightbox.vue:786 src/component/service/upload.vue:159
#: src/component/service/upload.vue:171
@ -2345,7 +2345,7 @@ msgstr "OK"
#: src/component/photo/toolbar.vue:432 src/component/photo/toolbar.vue:442
#: src/page/albums.vue:463
msgid "Oldest First"
msgstr "Ο παλαιότερος πρώτος"
msgstr "Πρώτα τα πιο Παλιά"
#: src/component/settings/webdav.vue:17 src/component/settings/webdav.vue:18
#: src/component/settings/webdav.vue:27 src/component/settings/webdav.vue:38
@ -2608,7 +2608,7 @@ msgstr "ΜΜ"
#: src/page/settings/advanced.vue:338
msgid "PNG Size Limit: %{n}px"
msgstr "Όριο μεγέθους PNG: %{n}px"
msgstr "Όριο Μεγέθους PNG: %{n}px"
#: src/locales.js:323
msgid "Portal"
@ -2642,7 +2642,7 @@ msgstr "Προεπισκόπηση"
#: src/page/settings/advanced.vue:222
msgid "Preview Images"
msgstr "Εικόνες προεπισκόπισης"
msgstr "Εικόνες Προεπισκόπισης"
#: src/component/lightbox.vue:411 src/locales.js:298
msgid "Previous"
@ -2719,13 +2719,13 @@ msgstr "Δημιουργία ευρετηρίου όλων των πρωτοτύ
#: src/page/settings/advanced.vue:68
msgid "Read-Only Mode"
msgstr "Λειτουργία μόνο για ανάγνωση"
msgstr "Λειτουργία Μόνο Ανάγνωσης"
#: src/component/album/edit/dialog.vue:145 src/component/photo/toolbar.vue:421
#: src/component/photo/toolbar.vue:433 src/component/photo/toolbar.vue:443
#: src/page/albums.vue:464
msgid "Recently Added"
msgstr "Πρόσφατα προστέθηκε"
msgstr "Πρόσφατες Προσθήκες"
#: src/component/photo/toolbar.vue:422
msgid "Recently Archived"
@ -2742,7 +2742,7 @@ msgstr "Η αναγνώριση ξεκινά μετά την ολοκλήρωσ
#: src/page/settings/general.vue:88
msgid "Recognize faces so people can be assigned and found."
msgstr "Αναγνωρίζει πρόσωπα ώστε να μπορούν να βρεθούν συγκεκριμένα άτομα."
msgstr "Αναγνώριση προσώπων ώστε να μπορούν να ορίζονται και να εντοπίζονται συγκεκριμένα άτομα."
#: src/page/people.vue:61
msgid "Recognized"
@ -3039,7 +3039,7 @@ msgstr "URL υπηρεσίας"
#: src/locales.js:359 src/page/settings.vue:99
#: src/page/settings/general.vue:267
msgid "Services"
msgstr "URL υπηρεσίας"
msgstr "Υπηρεσίες"
#: src/locales.js:342 src/model/session.js:83 src/options/auth.js:42
#: src/options/auth.js:91
@ -3114,7 +3114,7 @@ msgstr "Εμφάνιση όλων των νέων προσώπων"
#: src/page/settings/content.vue:169
msgid "Show Captions"
msgstr "Εμφάνιση λεζάντων"
msgstr "Εμφάνιση Λεζάντων"
#: src/page/people/new.vue:12 src/page/people/recognized.vue:45
msgid "Show hidden"
@ -3130,7 +3130,7 @@ msgstr "Εμφάνιση των αρχείων καταγραφής του δι
#: src/page/settings/content.vue:155
msgid "Show Titles"
msgstr "Εμφάνιση τίτλων"
msgstr "Εμφάνιση Τίτλων"
#: src/model/file.js:190 src/page/settings/content.vue:221
msgid "Sidecar"
@ -3164,7 +3164,7 @@ msgstr "Μέγεθος"
#: src/component/lightbox.vue:1247 src/component/lightbox.vue:1248
msgid "Slideshow"
msgstr "Παρουσίαση διαφανειών"
msgstr "Παρουσίαση"
#: src/options/options.js:230
msgid "Slow"
@ -3266,7 +3266,7 @@ msgstr "Σελίδα έναρξης"
#: src/page/settings/advanced.vue:244
msgid "Static Size Limit: %{n}px"
msgstr "Όριο στατικού μεγέθους: %{n}px"
msgstr "Όριο Στατικού Μεγέθους: %{n}px"
#: src/component/photo/edit/files.vue:463
msgid "Status"
@ -3339,7 +3339,7 @@ msgstr "Γαλαζοπράσινο"
#: src/page/settings/advanced.vue:150
msgid "TensorFlow is required for image classification, facial recognition, and detecting unsafe content."
msgstr "Το TensorFlow απαιτείται για την ταξινόμηση εικόνων, την αναγνώριση προσώπου και την ανίχνευση μη ασφαλούς περιεχομένου."
msgstr "Το TensorFlow απαιτείται για την ταξινόμηση εικόνων, την αναγνώριση προσώπων και την ανίχνευση μη ασφαλούς περιεχομένου."
#: src/options/options.js:267
msgid "Terrain"
@ -3443,7 +3443,7 @@ msgstr "Σήμερα"
#: src/component/album/toolbar.vue:28 src/component/photo/toolbar.vue:55
msgid "Toggle View"
msgstr "Εναλλαγή προβολής"
msgstr "Εναλλαγή Προβολής"
#: src/component/share/dialog.vue:89
msgid "Token"
@ -3606,7 +3606,7 @@ msgstr "Διαδρομή φόρτωσης"
#: src/page/settings/general.vue:209
msgid "Upload to WebDAV and share links with friends."
msgstr "Ανεβάστε σε WebDAV και μοιραστείτε συνδέσμους με φίλους."
msgstr "Ανεβάστε στο WebDAV και μοιραστείτε συνδέσμους με φίλους."
#: src/component/upload/dialog.vue:40
msgid "Uploading %{n} of %{t}…"
@ -3658,7 +3658,7 @@ msgstr "Διεπαφή χρήστη"
#: src/component/service/add.vue:38 src/component/service/edit.vue:172
#: src/component/share/dialog.vue:174 src/locales.js:320
msgid "Username"
msgstr "Όνομα χρήστη"
msgstr "Όνομα Χρήστη"
#: src/component/navigation.vue:400 src/component/navigation.vue:401
#: src/component/navigation.vue:425 src/component/navigation.vue:431

View file

@ -0,0 +1,102 @@
import { mount, config as VTUConfig } from "@vue/test-utils";
import { describe, it, expect, beforeEach } from "vitest";
import { nextTick } from "vue";
import PLightbox from "component/lightbox.vue";
const mountLightbox = () =>
mount(PLightbox, {
global: {
stubs: {
"v-dialog": true,
"v-icon": true,
"v-slider": true,
"p-lightbox-menu": true,
"p-sidebar-info": true,
},
},
});
describe("PLightbox (low-mock, jsdom-friendly)", () => {
beforeEach(() => {
localStorage.removeItem("lightbox.info");
sessionStorage.removeItem("lightbox.muted");
});
it("toggleInfo updates info and localStorage when visible", async () => {
const wrapper = mountLightbox();
await wrapper.setData({ visible: true });
// Use exposed onShortCut to trigger info toggle (KeyI)
await wrapper.vm.onShortCut({ code: "KeyI" });
await nextTick();
expect(localStorage.getItem("lightbox.info")).toBe("true");
await wrapper.vm.onShortCut({ code: "KeyI" });
await nextTick();
expect(localStorage.getItem("lightbox.info")).toBe("false");
});
it("toggleMute writes sessionStorage without requiring video or exposed state", async () => {
const wrapper = mountLightbox();
expect(sessionStorage.getItem("lightbox.muted")).toBeNull();
await wrapper.vm.onShortCut({ code: "KeyM" });
expect(sessionStorage.getItem("lightbox.muted")).toBe("true");
await wrapper.vm.onShortCut({ code: "KeyM" });
expect(sessionStorage.getItem("lightbox.muted")).toBe("false");
});
it("getPadding returns expected structure for large and small screens", async () => {
const wrapper = mountLightbox();
// Large viewport
const large = wrapper.vm.$options.methods.getPadding.call(
wrapper.vm,
{ x: 1200, y: 800 },
{ width: 4000, height: 3000 }
);
expect(large).toHaveProperty("top");
expect(large).toHaveProperty("bottom");
expect(large).toHaveProperty("left");
expect(large).toHaveProperty("right");
// Small viewport (<= mobileBreakpoint) should yield zeros
const small = wrapper.vm.$options.methods.getPadding.call(
wrapper.vm,
{ x: 360, y: 640 },
{ width: 1200, height: 800 }
);
expect(small).toEqual({ top: 0, bottom: 0, left: 0, right: 0 });
});
it("KeyI is ignored when dialog is not visible", async () => {
const wrapper = mountLightbox();
expect(localStorage.getItem("lightbox.info")).toBeNull();
await wrapper.vm.onShortCut({ code: "KeyI" });
expect(localStorage.getItem("lightbox.info")).toBeNull();
});
it("getViewport falls back to window size without content ref", () => {
const wrapper = mountLightbox();
const vp = wrapper.vm.$options.methods.getViewport.call(wrapper.vm);
expect(vp.x).toBeGreaterThan(0);
expect(vp.y).toBeGreaterThan(0);
});
it("menuActions marks Download action visible when allowed", () => {
const wrapper = mountLightbox();
const ctx = {
$gettext: VTUConfig.global.mocks.$gettext,
$pgettext: VTUConfig.global.mocks.$pgettext,
// minimal state needed by menuActions visibility checks
canManageAlbums: false,
canArchive: false,
canDownload: true,
collection: null,
context: "",
model: {},
};
const actions = wrapper.vm.$options.methods.menuActions.call(ctx);
const download = actions.find((a) => a?.name === "download");
expect(download).toBeTruthy();
expect(download.visible).toBe(true);
});
});

View file

@ -38,10 +38,48 @@ if (typeof global.ResizeObserver === "undefined") {
// Configure Vue Test Utils global configuration
config.global.mocks = {
$gettext: (text) => text,
$pgettext: (_ctx, text) => text,
$isRtl: false,
$config: {
feature: (_name) => true,
feature: () => true,
get: () => false,
getSettings: () => ({ features: { edit: true, favorites: true, download: true, archive: true } }),
allow: () => true,
featExperimental: () => false,
featDevelop: () => false,
values: {},
dir: () => "ltr",
},
$event: {
subscribe: () => "sub-id",
subscribeOnce: () => "sub-id-once",
unsubscribe: () => {},
publish: () => {},
},
$view: {
enter: () => {},
leave: () => {},
isActive: () => true,
},
$notify: { success: () => {}, error: () => {}, warn: () => {} },
$fullscreen: {
isSupported: () => true,
isEnabled: () => false,
request: () => Promise.resolve(),
exit: () => Promise.resolve(),
},
$clipboard: { selection: [], has: () => false, toggle: () => {} },
$util: {
hasTouch: () => false,
encodeHTML: (s) => s,
sanitizeHtml: (s) => s,
formatSeconds: (n) => String(n),
formatRemainingSeconds: () => "0",
videoFormat: () => "avc",
videoFormatUrl: () => "/v.mp4",
thumb: () => ({ src: "/t.jpg", w: 100, h: 100 }),
},
$api: { post: vi.fn(), delete: vi.fn(), get: vi.fn() },
};
config.global.plugins = [vuetify];

View file

@ -9,6 +9,9 @@ import (
"io"
"net/http"
"github.com/sirupsen/logrus"
"github.com/photoprism/photoprism/internal/ai/vision/ollama"
"github.com/photoprism/photoprism/pkg/clean"
"github.com/photoprism/photoprism/pkg/http/header"
)
@ -69,6 +72,10 @@ func PerformApiRequest(apiRequest *ApiRequest, uri, method, key string) (apiResp
return nil, parseErr
}
if log.IsLevelEnabled(logrus.TraceLevel) {
log.Tracef("vision: response %s", string(body))
}
return parsed, nil
}
@ -89,12 +96,12 @@ func PerformApiRequest(apiRequest *ApiRequest, uri, method, key string) (apiResp
return apiResponse, nil
}
func decodeOllamaResponse(data []byte) (*ApiResponseOllama, error) {
resp := &ApiResponseOllama{}
func decodeOllamaResponse(data []byte) (*ollama.Response, error) {
resp := &ollama.Response{}
dec := json.NewDecoder(bytes.NewReader(data))
for {
var chunk ApiResponseOllama
var chunk ollama.Response
if err := dec.Decode(&chunk); err != nil {
if errors.Is(err, io.EOF) {
break

View file

@ -8,6 +8,7 @@ import (
"github.com/stretchr/testify/assert"
"github.com/photoprism/photoprism/internal/ai/vision/ollama"
"github.com/photoprism/photoprism/pkg/http/scheme"
)
@ -49,7 +50,7 @@ func TestPerformApiRequestOllama(t *testing.T) {
var req ApiRequest
assert.NoError(t, json.NewDecoder(r.Body).Decode(&req))
assert.Equal(t, FormatJSON, req.Format)
assert.NoError(t, json.NewEncoder(w).Encode(ApiResponseOllama{
assert.NoError(t, json.NewEncoder(w).Encode(ollama.Response{
Model: "qwen2.5vl:latest",
Response: `{"labels":[{"name":"test","confidence":0.9,"topicality":0.8}]}`,
}))
@ -72,7 +73,7 @@ func TestPerformApiRequestOllama(t *testing.T) {
})
t.Run("LabelsWithCodeFence", func(t *testing.T) {
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
assert.NoError(t, json.NewEncoder(w).Encode(ApiResponseOllama{
assert.NoError(t, json.NewEncoder(w).Encode(ollama.Response{
Model: "gemma3:latest",
Response: "```json\n{\"labels\":[{\"name\":\"lingerie\",\"confidence\":0.81,\"topicality\":0.73}]}\n```\nThe model provided additional commentary.",
}))
@ -95,7 +96,7 @@ func TestPerformApiRequestOllama(t *testing.T) {
})
t.Run("CaptionFallback", func(t *testing.T) {
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
assert.NoError(t, json.NewEncoder(w).Encode(ApiResponseOllama{
assert.NoError(t, json.NewEncoder(w).Encode(ollama.Response{
Model: "qwen2.5vl:latest",
Response: "plain text",
}))

View file

@ -1,10 +1,8 @@
package vision
import (
"errors"
"fmt"
"os"
"time"
"github.com/photoprism/photoprism/pkg/clean"
"github.com/photoprism/photoprism/pkg/http/scheme"
@ -12,53 +10,6 @@ import (
"github.com/photoprism/photoprism/pkg/rnd"
)
// ApiResponseOllama represents a Ollama API service response.
type ApiResponseOllama struct {
Id string `yaml:"Id,omitempty" json:"id,omitempty"`
Code int `yaml:"Code,omitempty" json:"code,omitempty"`
Error string `yaml:"Error,omitempty" json:"error,omitempty"`
Model string `yaml:"Model,omitempty" json:"model,omitempty"`
CreatedAt time.Time `yaml:"CreatedAt,omitempty" json:"created_at,omitempty"`
Response string `yaml:"Response,omitempty" json:"response,omitempty"`
Done bool `yaml:"Done,omitempty" json:"done,omitempty"`
Context []int `yaml:"Context,omitempty" json:"context,omitempty"`
TotalDuration int64 `yaml:"TotalDuration,omitempty" json:"total_duration,omitempty"`
LoadDuration int `yaml:"LoadDuration,omitempty" json:"load_duration,omitempty"`
PromptEvalCount int `yaml:"PromptEvalCount,omitempty" json:"prompt_eval_count,omitempty"`
PromptEvalDuration int `yaml:"PromptEvalDuration,omitempty" json:"prompt_eval_duration,omitempty"`
EvalCount int `yaml:"EvalCount,omitempty" json:"eval_count,omitempty"`
EvalDuration int64 `yaml:"EvalDuration,omitempty" json:"eval_duration,omitempty"`
Result ApiResult `yaml:"Result,omitempty" json:"result,omitempty"`
}
// Err returns an error if the request has failed.
func (r *ApiResponseOllama) Err() error {
if r == nil {
return errors.New("response is nil")
}
if r.Code >= 400 {
if r.Error != "" {
return errors.New(r.Error)
}
return fmt.Errorf("error %d", r.Code)
} else if r.Result.IsEmpty() {
return errors.New("no result")
}
return nil
}
// HasResult checks if there is at least one result in the response data.
func (r *ApiResponseOllama) HasResult() bool {
if r == nil {
return false
}
return !r.Result.IsEmpty()
}
// NewApiRequestOllama returns a new Ollama API request with the specified images as payload.
func NewApiRequestOllama(images Files, fileScheme scheme.Type) (*ApiRequest, error) {
imagesData := make(Files, len(images))

View file

@ -11,6 +11,8 @@ import (
"github.com/sirupsen/logrus"
"github.com/photoprism/photoprism/internal/ai/vision/openai"
"github.com/photoprism/photoprism/internal/ai/vision/schema"
"github.com/photoprism/photoprism/internal/api/download"
"github.com/photoprism/photoprism/pkg/clean"
"github.com/photoprism/photoprism/pkg/fs"
@ -58,6 +60,11 @@ type ApiRequestOptions struct {
UseMmap bool `yaml:"UseMmap,omitempty" json:"use_mmap,omitempty"`
UseMlock bool `yaml:"UseMlock,omitempty" json:"use_mlock,omitempty"`
NumThread int `yaml:"NumThread,omitempty" json:"num_thread,omitempty"`
MaxOutputTokens int `yaml:"MaxOutputTokens,omitempty" json:"max_output_tokens,omitempty"`
Detail string `yaml:"Detail,omitempty" json:"detail,omitempty"`
ForceJson bool `yaml:"ForceJson,omitempty" json:"force_json,omitempty"`
SchemaVersion string `yaml:"SchemaVersion,omitempty" json:"schema_version,omitempty"`
CombineOutputs string `yaml:"CombineOutputs,omitempty" json:"combine_outputs,omitempty"`
}
// ApiRequestContext represents a context parameter returned from a previous request.
@ -77,6 +84,7 @@ type ApiRequest struct {
Context *ApiRequestContext `form:"context" yaml:"Context,omitempty" json:"context,omitempty"`
Stream bool `form:"stream" yaml:"Stream,omitempty" json:"stream"`
Images Files `form:"images" yaml:"Images,omitempty" json:"images,omitempty"`
Schema json.RawMessage `form:"schema" yaml:"Schema,omitempty" json:"schema,omitempty"`
ResponseFormat ApiFormat `form:"-" yaml:"-" json:"-"`
}
@ -195,6 +203,14 @@ func (r *ApiRequest) GetResponseFormat() ApiFormat {
// JSON returns the request data as JSON-encoded bytes.
func (r *ApiRequest) JSON() ([]byte, error) {
if r == nil {
return nil, errors.New("api request is nil")
}
if r.ResponseFormat == ApiFormatOpenAI {
return r.openAIJSON()
}
return json.Marshal(*r)
}
@ -229,6 +245,8 @@ func (r *ApiRequest) sanitizedForLog() ApiRequest {
sanitized.Url = sanitizeLogPayload(r.Url)
sanitized.Schema = r.Schema
return sanitized
}
@ -287,3 +305,134 @@ func isLikelyBase64(value string) bool {
return true
}
// openAIJSON converts the request data into an OpenAI Responses API payload.
func (r *ApiRequest) openAIJSON() ([]byte, error) {
detail := openai.DefaultDetail
if opts := r.Options; opts != nil && strings.TrimSpace(opts.Detail) != "" {
detail = strings.TrimSpace(opts.Detail)
}
messages := make([]openai.InputMessage, 0, 2)
if system := strings.TrimSpace(r.System); system != "" {
messages = append(messages, openai.InputMessage{
Role: "system",
Type: "message",
Content: []openai.ContentItem{
{
Type: openai.ContentTypeText,
Text: system,
},
},
})
}
userContent := make([]openai.ContentItem, 0, len(r.Images)+1)
if prompt := strings.TrimSpace(r.Prompt); prompt != "" {
userContent = append(userContent, openai.ContentItem{
Type: openai.ContentTypeText,
Text: prompt,
})
}
for _, img := range r.Images {
if img == "" {
continue
}
userContent = append(userContent, openai.ContentItem{
Type: openai.ContentTypeImage,
ImageURL: img,
Detail: detail,
})
}
if len(userContent) > 0 {
messages = append(messages, openai.InputMessage{
Role: "user",
Type: "message",
Content: userContent,
})
}
if len(messages) == 0 {
return nil, errors.New("openai request requires at least one message")
}
payload := openai.HTTPRequest{
Model: strings.TrimSpace(r.Model),
Input: messages,
}
if payload.Model == "" {
payload.Model = openai.DefaultModel
}
if strings.HasPrefix(strings.ToLower(payload.Model), "gpt-5") {
payload.Reasoning = &openai.Reasoning{Effort: "low"}
}
if opts := r.Options; opts != nil {
if opts.MaxOutputTokens > 0 {
payload.MaxOutputTokens = opts.MaxOutputTokens
}
if opts.Temperature > 0 {
payload.Temperature = opts.Temperature
}
if opts.TopP > 0 {
payload.TopP = opts.TopP
}
if opts.PresencePenalty != 0 {
payload.PresencePenalty = opts.PresencePenalty
}
if opts.FrequencyPenalty != 0 {
payload.FrequencyPenalty = opts.FrequencyPenalty
}
}
if format := buildOpenAIResponseFormat(r); format != nil {
payload.Text = &openai.TextOptions{
Format: format,
}
}
return json.Marshal(payload)
}
// buildOpenAIResponseFormat determines which response_format to send to OpenAI.
func buildOpenAIResponseFormat(r *ApiRequest) *openai.ResponseFormat {
if r == nil {
return nil
}
opts := r.Options
hasSchema := len(r.Schema) > 0
if !hasSchema && (opts == nil || !opts.ForceJson) {
return nil
}
result := &openai.ResponseFormat{}
if hasSchema {
result.Type = openai.ResponseFormatJSONSchema
result.Schema = r.Schema
if opts != nil && strings.TrimSpace(opts.SchemaVersion) != "" {
result.Name = strings.TrimSpace(opts.SchemaVersion)
} else {
result.Name = schema.JsonSchemaName(r.Schema, openai.DefaultSchemaVersion)
}
} else {
result.Type = openai.ResponseFormatJSONObject
}
return result
}

View file

@ -53,7 +53,11 @@ func captionInternal(images Files, mediaSrc media.Src) (result *CaptionResult, m
apiRequest.System = model.GetSystemPrompt()
apiRequest.Prompt = model.GetPrompt()
apiRequest.Options = model.GetOptions()
if apiRequest.Options == nil {
apiRequest.Options = model.GetOptions()
}
apiRequest.WriteLog()
if apiResponse, err = PerformApiRequest(apiRequest, uri, method, model.EndpointKey()); err != nil {

View file

@ -58,14 +58,15 @@ func init() {
RegisterEngineAlias(EngineVision, EngineInfo{
RequestFormat: ApiFormatVision,
ResponseFormat: ApiFormatVision,
FileScheme: string(scheme.Data),
FileScheme: scheme.Data,
DefaultResolution: DefaultResolution,
})
RegisterEngineAlias(openai.EngineName, EngineInfo{
Uri: "https://api.openai.com/v1/responses",
RequestFormat: ApiFormatOpenAI,
ResponseFormat: ApiFormatOpenAI,
FileScheme: string(scheme.Data),
FileScheme: scheme.Data,
DefaultResolution: openai.DefaultResolution,
})
}
@ -79,6 +80,7 @@ func RegisterEngine(format ApiFormat, engine Engine) {
// EngineInfo describes metadata that can be associated with an engine alias.
type EngineInfo struct {
Uri string
RequestFormat ApiFormat
ResponseFormat ApiFormat
FileScheme string

View file

@ -28,7 +28,7 @@ func init() {
RegisterEngineAlias(ollama.EngineName, EngineInfo{
RequestFormat: ApiFormatOllama,
ResponseFormat: ApiFormatOllama,
FileScheme: string(scheme.Base64),
FileScheme: scheme.Base64,
DefaultResolution: ollama.DefaultResolution,
})
@ -72,7 +72,7 @@ func (ollamaDefaults) SchemaTemplate(model *Model) string {
switch model.Type {
case ModelTypeLabels:
return ollama.LabelsSchema(model.PromptContains("nsfw"))
return ollama.SchemaLabels(model.PromptContains("nsfw"))
}
return ""
@ -134,64 +134,99 @@ func (ollamaParser) Parse(ctx context.Context, req *ApiRequest, raw []byte, stat
return nil, err
}
result := &ApiResponse{
response := &ApiResponse{
Id: req.GetId(),
Code: status,
Model: &Model{Name: ollamaResp.Model},
Result: ApiResult{
Labels: append([]LabelResult{}, ollamaResp.Result.Labels...),
Caption: func() *CaptionResult {
if ollamaResp.Result.Caption != nil {
copyCaption := *ollamaResp.Result.Caption
return &copyCaption
}
return nil
}(),
Labels: convertOllamaLabels(ollamaResp.Result.Labels),
Caption: convertOllamaCaption(ollamaResp.Result.Caption),
},
}
parsedLabels := len(result.Result.Labels) > 0
parsedLabels := len(response.Result.Labels) > 0
if !parsedLabels && strings.TrimSpace(ollamaResp.Response) != "" && req.Format == FormatJSON {
if labels, parseErr := parseOllamaLabels(ollamaResp.Response); parseErr != nil {
log.Debugf("vision: %s (parse ollama labels)", clean.Error(parseErr))
// Qwen3-VL models stream their JSON payload in the "Thinking" field.
fallbackJSON := strings.TrimSpace(ollamaResp.Response)
if fallbackJSON == "" {
fallbackJSON = strings.TrimSpace(ollamaResp.Thinking)
}
if !parsedLabels && fallbackJSON != "" && (req.Format == FormatJSON || strings.HasPrefix(fallbackJSON, "{")) {
if labels, parseErr := parseOllamaLabels(fallbackJSON); parseErr != nil {
log.Warnf("vision: %s (parse ollama labels)", clean.Error(parseErr))
} else if len(labels) > 0 {
result.Result.Labels = append(result.Result.Labels, labels...)
response.Result.Labels = append(response.Result.Labels, labels...)
parsedLabels = true
}
}
if parsedLabels {
filtered := result.Result.Labels[:0]
for i := range result.Result.Labels {
if result.Result.Labels[i].Confidence <= 0 {
result.Result.Labels[i].Confidence = ollama.LabelConfidenceDefault
filtered := response.Result.Labels[:0]
for i := range response.Result.Labels {
if response.Result.Labels[i].Confidence <= 0 {
response.Result.Labels[i].Confidence = ollama.LabelConfidenceDefault
}
if result.Result.Labels[i].Topicality <= 0 {
result.Result.Labels[i].Topicality = result.Result.Labels[i].Confidence
if response.Result.Labels[i].Topicality <= 0 {
response.Result.Labels[i].Topicality = response.Result.Labels[i].Confidence
}
// Apply thresholds and canonicalize the name.
normalizeLabelResult(&result.Result.Labels[i])
normalizeLabelResult(&response.Result.Labels[i])
if result.Result.Labels[i].Name == "" {
if response.Result.Labels[i].Name == "" {
continue
}
if result.Result.Labels[i].Source == "" {
result.Result.Labels[i].Source = entity.SrcOllama
if response.Result.Labels[i].Source == "" {
response.Result.Labels[i].Source = entity.SrcOllama
}
filtered = append(filtered, result.Result.Labels[i])
filtered = append(filtered, response.Result.Labels[i])
}
result.Result.Labels = filtered
response.Result.Labels = filtered
} else if caption := strings.TrimSpace(ollamaResp.Response); caption != "" {
result.Result.Caption = &CaptionResult{
response.Result.Caption = &CaptionResult{
Text: caption,
Source: entity.SrcOllama,
}
}
return result, nil
return response, nil
}
func convertOllamaLabels(payload []ollama.LabelPayload) []LabelResult {
if len(payload) == 0 {
return nil
}
labels := make([]LabelResult, len(payload))
for i := range payload {
labels[i] = LabelResult{
Name: payload[i].Name,
Source: payload[i].Source,
Priority: payload[i].Priority,
Confidence: payload[i].Confidence,
Topicality: payload[i].Topicality,
Categories: payload[i].Categories,
NSFW: payload[i].NSFW,
NSFWConfidence: payload[i].NSFWConfidence,
}
}
return labels
}
func convertOllamaCaption(payload *ollama.CaptionPayload) *CaptionResult {
if payload == nil {
return nil
}
return &CaptionResult{
Text: payload.Text,
Source: payload.Source,
Confidence: payload.Confidence,
}
}

View file

@ -10,9 +10,9 @@ import (
func TestOllamaDefaultConfidenceApplied(t *testing.T) {
req := &ApiRequest{Format: FormatJSON}
payload := ApiResponseOllama{
Result: ApiResult{
Labels: []LabelResult{{Name: "forest path", Confidence: 0, Topicality: 0}},
payload := ollama.Response{
Result: ollama.ResultPayload{
Labels: []ollama.LabelPayload{{Name: "forest path", Confidence: 0, Topicality: 0}},
},
}
raw, err := json.Marshal(payload)
@ -37,3 +37,46 @@ func TestOllamaDefaultConfidenceApplied(t *testing.T) {
t.Fatalf("expected topicality to default to confidence, got %.2f", resp.Result.Labels[0].Topicality)
}
}
func TestOllamaParserFallbacks(t *testing.T) {
t.Run("ThinkingFieldJSON", func(t *testing.T) {
req := &ApiRequest{Format: FormatJSON}
payload := ollama.Response{
Thinking: `{"labels":[{"name":"cat","confidence":0.9,"topicality":0.8}]}`,
}
raw, err := json.Marshal(payload)
if err != nil {
t.Fatalf("marshal: %v", err)
}
parser := ollamaParser{}
resp, err := parser.Parse(context.Background(), req, raw, 200)
if err != nil {
t.Fatalf("parse failed: %v", err)
}
if len(resp.Result.Labels) != 1 || resp.Result.Labels[0].Name != "Cat" {
t.Fatalf("expected cat label, got %+v", resp.Result.Labels)
}
})
t.Run("JsonPrefixedResponse", func(t *testing.T) {
req := &ApiRequest{} // no explicit format
payload := ollama.Response{
Response: `{"labels":[{"name":"cat","confidence":0.91,"topicality":0.81}]}`,
}
raw, err := json.Marshal(payload)
if err != nil {
t.Fatalf("marshal: %v", err)
}
parser := ollamaParser{}
resp, err := parser.Parse(context.Background(), req, raw, 200)
if err != nil {
t.Fatalf("parse failed: %v", err)
}
if len(resp.Result.Labels) != 1 || resp.Result.Labels[0].Name != "Cat" {
t.Fatalf("expected cat label, got %+v", resp.Result.Labels)
}
})
}

View file

@ -1,18 +1,342 @@
package vision
import (
"context"
"encoding/json"
"errors"
"fmt"
"strings"
"github.com/photoprism/photoprism/internal/ai/vision/openai"
"github.com/photoprism/photoprism/internal/entity"
"github.com/photoprism/photoprism/pkg/clean"
"github.com/photoprism/photoprism/pkg/http/scheme"
)
// init registers the OpenAI engine alias so models can set Engine: "openai"
// and inherit sensible defaults (request/response formats, file scheme, and
// preferred thumbnail resolution).
// openaiDefaults provides canned prompts, schema templates, and options for OpenAI engines.
type openaiDefaults struct{}
// openaiBuilder prepares ApiRequest objects for OpenAI's Responses API.
type openaiBuilder struct{}
// openaiParser converts Responses API payloads into ApiResponse instances.
type openaiParser struct{}
func init() {
RegisterEngineAlias(openai.EngineName, EngineInfo{
RequestFormat: ApiFormatOpenAI,
ResponseFormat: ApiFormatOpenAI,
FileScheme: string(scheme.Base64),
DefaultResolution: openai.DefaultResolution,
RegisterEngine(ApiFormatOpenAI, Engine{
Builder: openaiBuilder{},
Parser: openaiParser{},
Defaults: openaiDefaults{},
})
}
// SystemPrompt returns the default OpenAI system prompt for the specified model type.
func (openaiDefaults) SystemPrompt(model *Model) string {
if model == nil {
return ""
}
switch model.Type {
case ModelTypeCaption:
return openai.CaptionSystem
case ModelTypeLabels:
return openai.LabelSystem
default:
return ""
}
}
// UserPrompt returns the default OpenAI user prompt for the specified model type.
func (openaiDefaults) UserPrompt(model *Model) string {
if model == nil {
return ""
}
switch model.Type {
case ModelTypeCaption:
return openai.CaptionPrompt
case ModelTypeLabels:
if DetectNSFWLabels {
return openai.LabelPromptNSFW
}
return openai.LabelPromptDefault
default:
return ""
}
}
// SchemaTemplate returns the JSON schema template for the model, if applicable.
func (openaiDefaults) SchemaTemplate(model *Model) string {
if model == nil {
return ""
}
switch model.Type {
case ModelTypeLabels:
return string(openai.SchemaLabels(model.PromptContains("nsfw")))
default:
return ""
}
}
// Options returns default OpenAI request options for the model.
func (openaiDefaults) Options(model *Model) *ApiRequestOptions {
if model == nil {
return nil
}
switch model.Type {
case ModelTypeCaption:
/*
Options:
Detail: low
MaxOutputTokens: 512
Temperature: 0.1
TopP: 0.9
(Sampling values are zeroed for GPT-5 models in openaiBuilder.Build.)
*/
return &ApiRequestOptions{
Detail: openai.DefaultDetail,
MaxOutputTokens: openai.CaptionMaxTokens,
Temperature: openai.DefaultTemperature,
TopP: openai.DefaultTopP,
}
case ModelTypeLabels:
/*
Options:
Detail: low
MaxOutputTokens: 1024
Temperature: 0.1
ForceJson: true
SchemaVersion: "photoprism_vision_labels_v1"
(Sampling values are zeroed for GPT-5 models in openaiBuilder.Build.)
*/
return &ApiRequestOptions{
Detail: openai.DefaultDetail,
MaxOutputTokens: openai.LabelsMaxTokens,
Temperature: openai.DefaultTemperature,
TopP: openai.DefaultTopP,
ForceJson: true,
}
default:
return nil
}
}
// Build constructs an OpenAI request payload using base64-encoded thumbnails.
func (openaiBuilder) Build(ctx context.Context, model *Model, files Files) (*ApiRequest, error) {
if model == nil {
return nil, ErrInvalidModel
}
dataReq, err := NewApiRequestImages(files, scheme.Data)
if err != nil {
return nil, err
}
req := &ApiRequest{
Id: dataReq.Id,
Images: append(Files(nil), dataReq.Images...),
ResponseFormat: ApiFormatOpenAI,
}
if opts := model.GetOptions(); opts != nil {
req.Options = cloneOptions(opts)
if model.Type == ModelTypeCaption {
// Captions default to plain text responses; structured JSON is optional.
req.Options.ForceJson = false
if req.Options.MaxOutputTokens < openai.CaptionMaxTokens {
req.Options.MaxOutputTokens = openai.CaptionMaxTokens
}
} else if model.Type == ModelTypeLabels {
if req.Options.MaxOutputTokens < openai.LabelsMaxTokens {
req.Options.MaxOutputTokens = openai.LabelsMaxTokens
}
}
if strings.HasPrefix(strings.ToLower(strings.TrimSpace(model.Name)), "gpt-5") {
req.Options.Temperature = 0
req.Options.TopP = 0
}
}
if schema := strings.TrimSpace(model.SchemaTemplate()); schema != "" {
if raw, parseErr := parseOpenAISchema(schema); parseErr != nil {
log.Warnf("vision: failed to parse OpenAI schema template (%s)", clean.Error(parseErr))
} else {
req.Schema = raw
}
}
return req, nil
}
// Parse converts an OpenAI Responses API payload into the internal ApiResponse representation.
func (openaiParser) Parse(ctx context.Context, req *ApiRequest, raw []byte, status int) (*ApiResponse, error) {
if status >= 300 {
if msg := openai.ParseErrorMessage(raw); msg != "" {
return nil, fmt.Errorf("openai: %s", msg)
}
return nil, fmt.Errorf("openai: status %d", status)
}
var resp openai.Response
if err := json.Unmarshal(raw, &resp); err != nil {
return nil, err
}
if resp.Error != nil && resp.Error.Message != "" {
return nil, errors.New(resp.Error.Message)
}
result := ApiResult{}
if jsonPayload := resp.FirstJSON(); len(jsonPayload) > 0 {
if err := populateOpenAIJSONResult(&result, jsonPayload); err != nil {
log.Debugf("vision: %s (parse openai json payload)", clean.Error(err))
}
}
if result.Caption == nil {
if text := resp.FirstText(); text != "" {
trimmed := strings.TrimSpace(text)
var parsedJSON bool
if len(trimmed) > 0 && (trimmed[0] == '{' || trimmed[0] == '[') {
if err := populateOpenAIJSONResult(&result, json.RawMessage(trimmed)); err != nil {
log.Debugf("vision: %s (parse openai json text payload)", clean.Error(err))
} else {
parsedJSON = true
}
}
if !parsedJSON && trimmed != "" {
result.Caption = &CaptionResult{
Text: trimmed,
Source: entity.SrcOpenAI,
}
}
}
}
var responseID string
if req != nil {
responseID = req.GetId()
}
modelName := strings.TrimSpace(resp.Model)
if modelName == "" && req != nil {
modelName = strings.TrimSpace(req.Model)
}
return &ApiResponse{
Id: responseID,
Code: status,
Model: &Model{Name: modelName},
Result: result,
}, nil
}
// parseOpenAISchema validates the provided JSON schema and returns it as a raw message.
func parseOpenAISchema(schema string) (json.RawMessage, error) {
var raw json.RawMessage
if err := json.Unmarshal([]byte(schema), &raw); err != nil {
return nil, err
}
return normalizeOpenAISchema(raw)
}
// normalizeOpenAISchema upgrades legacy label schema definitions so they comply with
// OpenAI's json_schema format requirements.
func normalizeOpenAISchema(raw json.RawMessage) (json.RawMessage, error) {
if len(raw) == 0 {
return raw, nil
}
var doc map[string]any
if err := json.Unmarshal(raw, &doc); err != nil {
// Fallback to the original payload if it isn't a JSON object.
return raw, nil
}
if t, ok := doc["type"]; ok {
if typeStr, ok := t.(string); ok && strings.TrimSpace(typeStr) != "" {
return raw, nil
}
}
if _, ok := doc["properties"]; ok {
return raw, nil
}
labels, ok := doc["labels"]
if !ok {
return raw, nil
}
nsfw := false
if items, ok := labels.([]any); ok && len(items) > 0 {
if first, ok := items[0].(map[string]any); ok {
if _, hasNSFW := first["nsfw"]; hasNSFW {
nsfw = true
}
if _, hasNSFWConfidence := first["nsfw_confidence"]; hasNSFWConfidence {
nsfw = true
}
}
}
return openai.SchemaLabels(nsfw), nil
}
// populateOpenAIJSONResult unmarshals a structured OpenAI response into ApiResult fields.
func populateOpenAIJSONResult(result *ApiResult, payload json.RawMessage) error {
if result == nil || len(payload) == 0 {
return nil
}
var envelope struct {
Caption *struct {
Text string `json:"text"`
Confidence float32 `json:"confidence"`
} `json:"caption"`
Labels []LabelResult `json:"labels"`
}
if err := json.Unmarshal(payload, &envelope); err != nil {
return err
}
if envelope.Caption != nil {
text := strings.TrimSpace(envelope.Caption.Text)
if text != "" {
result.Caption = &CaptionResult{
Text: text,
Confidence: envelope.Caption.Confidence,
Source: entity.SrcOpenAI,
}
}
}
if len(envelope.Labels) > 0 {
filtered := envelope.Labels[:0]
for i := range envelope.Labels {
if envelope.Labels[i].Source == "" {
envelope.Labels[i].Source = entity.SrcOpenAI
}
normalizeLabelResult(&envelope.Labels[i])
if envelope.Labels[i].Name == "" {
continue
}
filtered = append(filtered, envelope.Labels[i])
}
result.Labels = append(result.Labels, filtered...)
}
return nil
}

View file

@ -0,0 +1,337 @@
package vision
import (
"context"
"encoding/json"
"net/http"
"net/http/httptest"
"testing"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"github.com/photoprism/photoprism/internal/ai/vision/openai"
"github.com/photoprism/photoprism/internal/ai/vision/schema"
"github.com/photoprism/photoprism/internal/entity"
)
func TestOpenAIBuilderBuild(t *testing.T) {
model := &Model{
Type: ModelTypeLabels,
Name: openai.DefaultModel,
Engine: openai.EngineName,
}
model.ApplyEngineDefaults()
request, err := openaiBuilder{}.Build(context.Background(), model, Files{examplesPath + "/chameleon_lime.jpg"})
require.NoError(t, err)
require.NotNil(t, request)
assert.Equal(t, ApiFormatOpenAI, request.ResponseFormat)
assert.NotEmpty(t, request.Images)
assert.NotNil(t, request.Options)
assert.Equal(t, openai.DefaultDetail, request.Options.Detail)
assert.True(t, request.Options.ForceJson)
assert.GreaterOrEqual(t, request.Options.MaxOutputTokens, openai.LabelsMaxTokens)
}
func TestOpenAIBuilderBuildCaptionDisablesForceJSON(t *testing.T) {
model := &Model{
Type: ModelTypeCaption,
Name: openai.DefaultModel,
Engine: openai.EngineName,
Options: &ApiRequestOptions{ForceJson: true},
}
model.ApplyEngineDefaults()
request, err := openaiBuilder{}.Build(context.Background(), model, Files{examplesPath + "/chameleon_lime.jpg"})
require.NoError(t, err)
require.NotNil(t, request)
require.NotNil(t, request.Options)
assert.False(t, request.Options.ForceJson)
assert.GreaterOrEqual(t, request.Options.MaxOutputTokens, openai.CaptionMaxTokens)
}
func TestApiRequestJSONForOpenAI(t *testing.T) {
req := &ApiRequest{
Model: "gpt-5-mini",
System: "system",
Prompt: "describe the scene",
Images: []string{"data:image/jpeg;base64,AA=="},
ResponseFormat: ApiFormatOpenAI,
Options: &ApiRequestOptions{
Detail: openai.DefaultDetail,
MaxOutputTokens: 128,
Temperature: 0.2,
TopP: 0.8,
ForceJson: true,
},
Schema: json.RawMessage(`{"type":"object","properties":{"caption":{"type":"object"}}}`),
}
payload, err := req.JSON()
require.NoError(t, err)
var decoded struct {
Model string `json:"model"`
Input []struct {
Role string `json:"role"`
Content []struct {
Type string `json:"type"`
} `json:"content"`
} `json:"input"`
Text struct {
Format struct {
Type string `json:"type"`
Name string `json:"name"`
Schema json.RawMessage `json:"schema"`
Strict bool `json:"strict"`
} `json:"format"`
} `json:"text"`
Reasoning struct {
Effort string `json:"effort"`
} `json:"reasoning"`
MaxOutputTokens int `json:"max_output_tokens"`
}
require.NoError(t, json.Unmarshal(payload, &decoded))
assert.Equal(t, "gpt-5-mini", decoded.Model)
require.Len(t, decoded.Input, 2)
assert.Equal(t, "system", decoded.Input[0].Role)
assert.Equal(t, openai.ResponseFormatJSONSchema, decoded.Text.Format.Type)
assert.Equal(t, schema.JsonSchemaName(decoded.Text.Format.Schema, openai.DefaultSchemaVersion), decoded.Text.Format.Name)
assert.False(t, decoded.Text.Format.Strict)
assert.NotNil(t, decoded.Text.Format.Schema)
assert.Equal(t, "low", decoded.Reasoning.Effort)
assert.Equal(t, 128, decoded.MaxOutputTokens)
}
func TestApiRequestJSONForOpenAIDefaultSchemaName(t *testing.T) {
req := &ApiRequest{
Model: "gpt-5-mini",
Images: []string{"data:image/jpeg;base64,AA=="},
ResponseFormat: ApiFormatOpenAI,
Options: &ApiRequestOptions{
Detail: openai.DefaultDetail,
MaxOutputTokens: 64,
ForceJson: true,
},
Schema: json.RawMessage(`{"type":"object"}`),
}
payload, err := req.JSON()
require.NoError(t, err)
var decoded struct {
Text struct {
Format struct {
Name string `json:"name"`
} `json:"format"`
} `json:"text"`
}
require.NoError(t, json.Unmarshal(payload, &decoded))
assert.Equal(t, schema.JsonSchemaName(req.Schema, openai.DefaultSchemaVersion), decoded.Text.Format.Name)
}
func TestOpenAIParserParsesJSONFromTextPayload(t *testing.T) {
respPayload := `{
"id": "resp_123",
"model": "gpt-5-mini",
"output": [{
"role": "assistant",
"content": [{
"type": "output_text",
"text": "{\"labels\":[{\"name\":\"deer\",\"confidence\":0.98,\"topicality\":0.99}]}"
}]
}]
}`
req := &ApiRequest{
Id: "test",
Model: "gpt-5-mini",
ResponseFormat: ApiFormatOpenAI,
}
resp, err := openaiParser{}.Parse(context.Background(), req, []byte(respPayload), http.StatusOK)
require.NoError(t, err)
require.NotNil(t, resp)
require.Len(t, resp.Result.Labels, 1)
assert.Equal(t, "Deer", resp.Result.Labels[0].Name)
assert.Nil(t, resp.Result.Caption)
}
func TestParseOpenAISchemaLegacyUpgrade(t *testing.T) {
legacy := `{
"labels": [{
"name": "",
"confidence": 0,
"topicality": 0
}]
}`
raw, err := parseOpenAISchema(legacy)
require.NoError(t, err)
var decoded map[string]any
require.NoError(t, json.Unmarshal(raw, &decoded))
assert.Equal(t, "object", decoded["type"])
props, ok := decoded["properties"].(map[string]any)
require.True(t, ok)
labels, ok := props["labels"].(map[string]any)
require.True(t, ok)
assert.Equal(t, "array", labels["type"])
}
func TestParseOpenAISchemaLegacyUpgradeNSFW(t *testing.T) {
legacy := `{
"labels": [{
"name": "",
"confidence": 0,
"topicality": 0,
"nsfw": false,
"nsfw_confidence": 0
}]
}`
raw, err := parseOpenAISchema(legacy)
require.NoError(t, err)
var decoded map[string]any
require.NoError(t, json.Unmarshal(raw, &decoded))
props := decoded["properties"].(map[string]any)
labels := props["labels"].(map[string]any)
items := labels["items"].(map[string]any)
_, hasNSFW := items["properties"].(map[string]any)["nsfw"]
assert.True(t, hasNSFW)
}
func TestPerformApiRequestOpenAISuccess(t *testing.T) {
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
var reqPayload struct {
Model string `json:"model"`
}
assert.NoError(t, json.NewDecoder(r.Body).Decode(&reqPayload))
assert.Equal(t, "gpt-5-mini", reqPayload.Model)
response := map[string]any{
"id": "resp_123",
"model": "gpt-5-mini",
"output": []any{
map[string]any{
"role": "assistant",
"content": []any{
map[string]any{
"type": "output_json",
"json": map[string]any{
"caption": map[string]any{
"text": "A cat rests on a windowsill.",
"confidence": 0.91,
},
"labels": []map[string]any{
{
"name": "cat",
"confidence": 0.92,
"topicality": 0.88,
},
},
},
},
},
},
},
}
assert.NoError(t, json.NewEncoder(w).Encode(response))
}))
defer server.Close()
req := &ApiRequest{
Id: "test",
Model: "gpt-5-mini",
Images: []string{"data:image/jpeg;base64,AA=="},
ResponseFormat: ApiFormatOpenAI,
Options: &ApiRequestOptions{
Detail: openai.DefaultDetail,
},
Schema: json.RawMessage(`{"type":"object"}`),
}
resp, err := PerformApiRequest(req, server.URL, http.MethodPost, "secret")
require.NoError(t, err)
require.NotNil(t, resp)
require.NotNil(t, resp.Result.Caption)
assert.Equal(t, entity.SrcOpenAI, resp.Result.Caption.Source)
assert.Equal(t, "A cat rests on a windowsill.", resp.Result.Caption.Text)
require.Len(t, resp.Result.Labels, 1)
assert.Equal(t, entity.SrcOpenAI, resp.Result.Labels[0].Source)
assert.Equal(t, "Cat", resp.Result.Labels[0].Name)
}
func TestPerformApiRequestOpenAITextFallback(t *testing.T) {
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
response := map[string]any{
"id": "resp_456",
"model": "gpt-5-mini",
"output": []any{
map[string]any{
"role": "assistant",
"content": []any{
map[string]any{
"type": "output_text",
"text": "Two hikers reach the summit at sunset.",
},
},
},
},
}
assert.NoError(t, json.NewEncoder(w).Encode(response))
}))
defer server.Close()
req := &ApiRequest{
Id: "fallback",
Model: "gpt-5-mini",
Images: []string{"data:image/jpeg;base64,AA=="},
ResponseFormat: ApiFormatOpenAI,
Options: &ApiRequestOptions{
Detail: openai.DefaultDetail,
},
Schema: nil,
}
resp, err := PerformApiRequest(req, server.URL, http.MethodPost, "")
require.NoError(t, err)
require.NotNil(t, resp.Result.Caption)
assert.Equal(t, "Two hikers reach the summit at sunset.", resp.Result.Caption.Text)
assert.Equal(t, entity.SrcOpenAI, resp.Result.Caption.Source)
}
func TestPerformApiRequestOpenAIError(t *testing.T) {
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
w.WriteHeader(http.StatusBadRequest)
_ = json.NewEncoder(w).Encode(map[string]any{
"error": map[string]any{
"message": "Invalid image payload",
},
})
}))
defer server.Close()
req := &ApiRequest{
Id: "error",
Model: "gpt-5-mini",
ResponseFormat: ApiFormatOpenAI,
Schema: nil,
Images: []string{"data:image/jpeg;base64,AA=="},
}
_, err := PerformApiRequest(req, server.URL, http.MethodPost, "")
require.Error(t, err)
assert.Contains(t, err.Error(), "Invalid image payload")
}

View file

@ -96,8 +96,10 @@ func labelsInternal(images Files, mediaSrc media.Src, labelSrc entity.Src) (resu
apiRequest.Prompt = prompt
}
if options := model.GetOptions(); options != nil {
apiRequest.Options = options
if apiRequest.Options == nil {
if options := model.GetOptions(); options != nil {
apiRequest.Options = options
}
}
apiRequest.WriteLog()

View file

@ -154,9 +154,11 @@ func (m *Model) EndpointKey() (key string) {
if key = m.Service.EndpointKey(); key != "" {
return key
} else {
return ServiceKey
}
ensureEnv()
return strings.TrimSpace(os.ExpandEnv(ServiceKey))
}
// EndpointFileScheme returns the endpoint API request file scheme type. Nil
@ -348,6 +350,26 @@ func mergeOptionDefaults(target, defaults *ApiRequestOptions) {
if len(target.Stop) == 0 && len(defaults.Stop) > 0 {
target.Stop = append([]string(nil), defaults.Stop...)
}
if target.MaxOutputTokens <= 0 && defaults.MaxOutputTokens > 0 {
target.MaxOutputTokens = defaults.MaxOutputTokens
}
if strings.TrimSpace(target.Detail) == "" && strings.TrimSpace(defaults.Detail) != "" {
target.Detail = strings.TrimSpace(defaults.Detail)
}
if !target.ForceJson && defaults.ForceJson {
target.ForceJson = true
}
if target.SchemaVersion == "" && defaults.SchemaVersion != "" {
target.SchemaVersion = defaults.SchemaVersion
}
if target.CombineOutputs == "" && defaults.CombineOutputs != "" {
target.CombineOutputs = defaults.CombineOutputs
}
}
func normalizeOptions(opts *ApiRequestOptions) {
@ -422,6 +444,10 @@ func (m *Model) ApplyEngineDefaults() {
}
if info, ok := EngineInfoFor(engine); ok {
if m.Service.Uri == "" {
m.Service.Uri = info.Uri
}
if m.Service.RequestFormat == "" {
m.Service.RequestFormat = info.RequestFormat
}
@ -439,6 +465,10 @@ func (m *Model) ApplyEngineDefaults() {
}
}
if engine == openai.EngineName && strings.TrimSpace(m.Service.Key) == "" {
m.Service.Key = "${OPENAI_API_KEY}"
}
m.Engine = engine
}
@ -490,7 +520,7 @@ func (m *Model) SchemaTemplate() string {
}
if m.schema == "" {
m.schema = visionschema.Labels(m.PromptContains("nsfw"))
m.schema = visionschema.LabelsJson(m.PromptContains("nsfw"))
}
}
})

View file

@ -1,13 +1,17 @@
package vision
import (
"os"
"path/filepath"
"testing"
"github.com/stretchr/testify/assert"
"github.com/photoprism/photoprism/internal/ai/tensorflow"
"github.com/photoprism/photoprism/internal/ai/vision/ollama"
"github.com/photoprism/photoprism/internal/ai/vision/openai"
"github.com/photoprism/photoprism/internal/entity"
"github.com/photoprism/photoprism/pkg/http/scheme"
)
func TestModelGetOptionsDefaultsOllamaLabels(t *testing.T) {
@ -108,6 +112,85 @@ func TestModelApplyEngineDefaultsSetsResolution(t *testing.T) {
}
}
func TestModelApplyEngineDefaultsSetsServiceDefaults(t *testing.T) {
t.Run("OpenAIEngine", func(t *testing.T) {
model := &Model{
Type: ModelTypeCaption,
Engine: openai.EngineName,
}
model.ApplyEngineDefaults()
assert.Equal(t, "https://api.openai.com/v1/responses", model.Service.Uri)
assert.Equal(t, ApiFormatOpenAI, model.Service.RequestFormat)
assert.Equal(t, ApiFormatOpenAI, model.Service.ResponseFormat)
assert.Equal(t, scheme.Data, model.Service.FileScheme)
})
t.Run("PreserveExistingService", func(t *testing.T) {
model := &Model{
Type: ModelTypeCaption,
Engine: openai.EngineName,
Service: Service{
Uri: "https://custom.example",
FileScheme: scheme.Base64,
RequestFormat: ApiFormatOpenAI,
},
}
model.ApplyEngineDefaults()
assert.Equal(t, "https://custom.example", model.Service.Uri)
assert.Equal(t, scheme.Base64, model.Service.FileScheme)
})
}
func TestModelEndpointKeyOpenAIFallbacks(t *testing.T) {
t.Run("EnvFile", func(t *testing.T) {
dir := t.TempDir()
path := filepath.Join(dir, "openai.key")
if err := os.WriteFile(path, []byte("from-file\n"), 0o600); err != nil {
t.Fatalf("write key file: %v", err)
}
t.Setenv("OPENAI_API_KEY", "")
t.Setenv("OPENAI_API_KEY_FILE", path)
model := &Model{Type: ModelTypeCaption, Engine: openai.EngineName}
model.ApplyEngineDefaults()
if got := model.EndpointKey(); got != "from-file" {
t.Fatalf("expected file key, got %q", got)
}
})
t.Run("CustomPlaceholder", func(t *testing.T) {
t.Setenv("OPENAI_API_KEY", "env-secret")
model := &Model{Type: ModelTypeCaption, Engine: openai.EngineName}
model.ApplyEngineDefaults()
if got := model.EndpointKey(); got != "env-secret" {
t.Fatalf("expected env secret, got %q", got)
}
model.Service.Key = "${CUSTOM_KEY}"
t.Setenv("CUSTOM_KEY", "custom-secret")
if got := model.EndpointKey(); got != "custom-secret" {
t.Fatalf("expected custom secret, got %q", got)
}
})
t.Run("GlobalFallback", func(t *testing.T) {
prev := ServiceKey
ServiceKey = "${GLOBAL_KEY}"
defer func() { ServiceKey = prev }()
t.Setenv("GLOBAL_KEY", "global-secret")
model := &Model{}
if got := model.EndpointKey(); got != "global-secret" {
t.Fatalf("expected global secret, got %q", got)
}
})
}
func TestModelGetSource(t *testing.T) {
t.Run("NilModel", func(t *testing.T) {
var model *Model
@ -115,21 +198,18 @@ func TestModelGetSource(t *testing.T) {
t.Fatalf("expected SrcAuto for nil model, got %s", src)
}
})
t.Run("EngineAlias", func(t *testing.T) {
model := &Model{Engine: ollama.EngineName}
if src := model.GetSource(); src != entity.SrcOllama {
t.Fatalf("expected SrcOllama, got %s", src)
}
})
t.Run("RequestFormat", func(t *testing.T) {
model := &Model{Service: Service{RequestFormat: ApiFormatOpenAI}}
if src := model.GetSource(); src != entity.SrcOpenAI {
t.Fatalf("expected SrcOpenAI, got %s", src)
}
})
t.Run("DefaultImage", func(t *testing.T) {
model := &Model{}
if src := model.GetSource(); src != entity.SrcImage {

View file

@ -0,0 +1,152 @@
## PhotoPrism — Ollama Engine Integration
**Last Updated:** November 14, 2025
### Overview
This package provides PhotoPrism's native adapter for Ollama-compatible multimodal models. It lets Caption, Labels, and future Generate workflows call locally hosted models without changing worker logic, reusing the shared API client (`internal/ai/vision/api_client.go`) and result types (`LabelResult`, `CaptionResult`). Requests stay inside your infrastructure, rely on base64 thumbnails, and honor the same ACL, timeout, and logging hooks as the default TensorFlow engines.
#### Context & Constraints
- Engine defaults live in `internal/ai/vision/ollama` and are applied whenever a model sets `Engine: ollama`. Aliases map to `ApiFormatOllama`, `scheme.Base64`, and a default 720px thumbnail.
- Responses may arrive as newline-delimited JSON chunks. `decodeOllamaResponse` keeps the most recent chunk, while `parseOllamaLabels` replays plain JSON strings found in `response`.
- Structured JSON is optional for captions but enforced for labels when `Format: json` (default for label models targeting the Ollama engine).
- The adapter never overwrites TensorFlow defaults. If an Ollama call fails, downstream code still has Nasnet, NSFW, and Face models available.
- Workers assume a single-image payload per request. Run `photoprism vision run` to validate multi-image prompts before changing that invariant.
#### Goals
- Let operators opt into local, private LLMs for captions and labels via `vision.yml`.
- Provide safe defaults (prompts, schema, sampling) so most deployments only need to specify `Name`, `Engine`, and `Service.Uri`.
- Surface reproducible logs, metrics, and CLI commands that make it easy to compare Ollama output against TensorFlow/OpenAI engines.
#### Non-Goals
- Managing Ollama itself (model downloads, GPU scheduling, or authentication). Use the Compose profiles provided in the repository.
- Adding new HTTP endpoints or bypassing the existing `photoprism vision` CLI.
- Replacing TensorFlow workers—Ollama engines are additive and opt-in.
### Architecture & Request Flow
1. **Model Selection**`Config.Model(ModelType)` returns the top-most enabled entry. When `Engine: ollama`, `ApplyEngineDefaults()` fills in the request/response format, base64 file scheme, and a 720px resolution unless overridden.
2. **Request Build**`ollamaBuilder.Build` wraps thumbnails with `NewApiRequestOllama`, which encodes them as base64 strings. `Model.Model()` resolves the exact Ollama tag (`gemma3:4b`, `qwen2.5vl:7b`, etc.).
3. **Transport**`PerformApiRequest` uses a single HTTP POST (default timeout 10min). Authentication is optional; provide `Service.Key` if you proxy through an API gateway.
4. **Parsing**`ollamaParser.Parse` converts payloads into `ApiResponse`. It normalizes confidences (`LabelConfidenceDefault = 0.5` when missing), copies NSFW scores, and canonicalizes label names via `normalizeLabelResult`.
5. **Persistence**`entity.SrcOllama` is stamped on labels/captions so UI badges and audits reflect the new source.
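The newline-delimited JSON behavior noted under Context & Constraints matters for step 4: streaming responses can arrive as many small chunks, and only the final one carries the complete payload. Below is a minimal, illustrative sketch of that chunk handling; the real decoder lives in `api_client.go` and may differ in detail (the `ndjson` package name and `map[string]any` payload type are assumptions for illustration):

```go
package ndjson // illustrative package name, not part of PhotoPrism

import (
	"bufio"
	"encoding/json"
	"io"
)

// lastChunk scans a newline-delimited JSON stream and keeps the most recent
// chunk that decodes successfully, mirroring the "keep the last chunk"
// behavior described above. Error handling is simplified for brevity.
func lastChunk(r io.Reader) (map[string]any, error) {
	var last map[string]any
	sc := bufio.NewScanner(r)
	for sc.Scan() {
		line := sc.Bytes()
		if len(line) == 0 {
			continue // skip blank lines between chunks
		}
		var chunk map[string]any
		if err := json.Unmarshal(line, &chunk); err != nil {
			continue // ignore partial or malformed chunks
		}
		last = chunk
	}
	return last, sc.Err()
}
```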
### Prompt, Schema, & Options Guidance
- **System Prompts**
- Labels: `LabelSystem` enforces single-word nouns. Set `System` to override; assign `LabelSystemSimple` when you need descriptive phrases.
- Captions: no system prompt by default; rely on user prompt or set one explicitly for stylistic needs.
- **User Prompts**
- Captions use `CaptionPrompt`, which requests one sentence in active voice.
- Labels default to `LabelPromptDefault`; when `DetectNSFWLabels` is true, the adapter swaps in `LabelPromptNSFW`.
- For stricter noun enforcement, set `Prompt` to `LabelPromptStrict`.
- **Schemas**
- Labels rely on `schema.LabelsJson(nsfw)` (simple JSON template). Setting `Format: json` auto-attaches a reminder (`model.SchemaInstructions()`).
- Override via `Schema` (inline YAML) or `SchemaFile`. `PHOTOPRISM_VISION_LABEL_SCHEMA_FILE` always wins if present.
- **Options**
- Labels: default `Temperature` equals `DefaultTemperature` (0.1 unless configured), `TopP=0.9`, `Stop=["\n\n"]`.
- Captions: only `Temperature` is set; other parameters inherit global defaults.
- Custom `Options` merge with engine defaults. Leave `ForceJson=true` for labels so PhotoPrism can reject malformed payloads early.
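To make the schema reminder concrete, here is a hedged sketch of how a JSON reminder could be appended to a label prompt when `Format: json` is active; the actual wiring goes through `model.SchemaInstructions()`, and the reminder wording below is an assumption:

```go
package prompt // illustrative only

// withSchemaReminder appends a JSON structure reminder to the user prompt
// when JSON output is enforced, approximating model.SchemaInstructions().
func withSchemaReminder(userPrompt, schemaSample string, forceJson bool) string {
	if !forceJson {
		return userPrompt
	}
	return userPrompt + "\n\nRespond only with JSON that matches this structure:\n" + schemaSample
}
```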
### Supported Ollama Vision Models
| Model (Ollama Tag) | Size & Footprint | Strengths | JSON & Language Notes | When To Use |
|-------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------|-----------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| `gemma3:4b / 12b / 27b` | 4B/12B/27B parameters, ~3.3GB → 17GB downloads, 128K context | Multimodal text+image reasoning with SigLIP encoder, handles OCR/long documents, supports tool/function calling | Emits structured JSON reliably; >140 languages with strong default English output | High-quality captions + multilingual labels when you have ≥12GB VRAM (4B works on 8GB with Q4_K_M) |
| `qwen2.5vl:7b` | 8.29B params (Q4_K_M) ≈6GB download, 125K context | Excellent charts, GUI grounding, DocVQA, multi-image reasoning, agentic tool use | JSON mode tuned for schema compliance; supports 20+ languages with strong Chinese/English parity | Label extraction for mixed-language archives or UI/diagram analysis |
| `qwen3-vl:2b / 4b / 8b` | Dense 2B/4B/8B tiers (~3GB, ~3.5GB, ~6GB downloads) with native 256K context extendable to 1M; fits single 12–24GB GPUs or high-end CPUs (2B) | Spatial + video reasoning upgrades (Interleaved-MRoPE, DeepStack), 32-language OCR, GUI/agent control, long-document ingest | Emits JSON reliably when prompts specify schema; multilingual captions/labels with Thinking variants boosting STEM reasoning | General-purpose captions/labels when you need long-context doc/video support without cloud APIs; 2B for CPU/edge, 4B as balanced default, 8B when accuracy outweighs latency |
| `llama3.2-vision:11b` | 11B params, ~7.8GB download, requires ≥8GB VRAM; 90B variant needs ≥64GB | Strong general reasoning, captioning, OCR, supported by Meta ecosystem tooling | Vision tasks officially supported in English; text-only tasks cover eight major languages | Keep captions consistent with Meta-compatible prompts or when teams already standardize on Llama 3.x |
| `minicpm-v:8b-2.6` | 8B params, ~5.5GB download, 32K context | Optimized for edge GPUs, high OCR accuracy, multi-image/video support, low token count (≈640 tokens for 1.8MP) | Multilingual (EN/ZH/DE/FR/IT/KR). Emits concise JSON but may need stricter stopping sequences | Memory-constrained deployments that still require NSFW/OCR-aware label output |
> Tip: pull models inside the dev container with `docker compose --profile ollama up -d` and then `docker compose exec ollama ollama pull gemma3:4b`. Keep the profile stopped when you do not need extra GPU/CPU load.
> Qwen3-VL models stream their JSON payload via the `thinking` field. PhotoPrism v2025.11+ captures this automatically; if you run older builds, upgrade before enabling these models or responses will appear empty.
### Configuration
#### Environment Variables
- `PHOTOPRISM_VISION_LABEL_SCHEMA_FILE` — Absolute path to a JSON snippet that overrides the default label schema (applies to every Ollama label model).
- `PHOTOPRISM_VISION_YAML` — Custom `vision.yml` path. Keep it synced in Git if you automate deployments.
- `OLLAMA_HOST`, `OLLAMA_MODELS`, `OLLAMA_MAX_QUEUE`, `OLLAMA_NUM_PARALLEL`, etc. — Provided in `compose*.yaml` to tune the Ollama daemon. Adjust `OLLAMA_KEEP_ALIVE` if you want models to stay loaded between worker batches.
- `PHOTOPRISM_LOG_LEVEL=trace` — Enables verbose request/response previews (truncated to avoid leaking images). Use temporarily when debugging parsing issues.
#### `vision.yml` Example
```yaml
Models:
- Type: labels
Name: qwen2.5vl:7b
Engine: ollama
Run: newly-indexed
Resolution: 720
Format: json
Options:
Temperature: 0.05
Stop: ["\n\n"]
ForceJson: true
Service:
Uri: http://ollama:11434/api/generate
RequestFormat: ollama
ResponseFormat: ollama
FileScheme: base64
- Type: caption
Name: gemma3:4b
Engine: ollama
Disabled: false
Options:
Temperature: 0.2
Service:
Uri: http://ollama:11434/api/generate
```
Guidelines:
- Place new entries after the default TensorFlow models so they take precedence while Nasnet/NSFW remain as fallbacks.
- Always specify the exact Ollama tag (`model:version`) so upgrades are deliberate.
- Keep option flags before positional arguments in CLI snippets (`photoprism vision run -m labels --count 1`).
- If you proxy requests (e.g., through Traefik), set `Service.Key` to `Bearer <token>` and configure the proxy to inject/validate it.
### Operational Checklist
- **Scheduling** — Use `Run: newly-indexed` for incremental runs, `Run: manual` for ad-hoc CLI calls, or `Run: on-schedule` when paired with the scheduler. Leave `Run: auto` if you want the worker to decide based on other model states.
- **Timeouts & Retries** — Default timeout is 10 minutes (`ServiceTimeout`). Ollama streaming responses complete faster in practice; if you need stricter SLAs, wrap `photoprism vision run` in a job runner and retry failed batches manually.
- **Fallbacks** — Keep Nasnet configured even when Ollama labels are primary. `labels.go` stops at the first successful engine, so duplicates are avoided.
- **Security** — When exposing Ollama beyond localhost, terminate TLS at Traefik and enable API keys. Never return full JSON payloads in logs; rely on trace mode only for debugging and sanitize before sharing.
- **Model Storage** — Bind-mount `./storage/services/ollama:/root/.ollama` (see Compose) so pulled models survive container restarts. Run `docker compose exec ollama ollama list` during deployments to verify availability.
### Observability & Testing
- **CLI Smoke Tests**
- Captions: `photoprism vision run -m caption --count 5 --force`.
- Labels: `photoprism vision run -m labels --count 5 --force`.
- After each run, check `photoprism vision ls` for `source=ollama`.
- **Unit Tests**
- `go test ./internal/ai/vision/ollama ./internal/ai/vision -run Ollama -count=1` covers transport parsing and model defaults.
- Add fixtures under `internal/ai/vision/testdata` when capturing new response shapes; keep files small and anonymized.
- **Logging**
- Set `PHOTOPRISM_LOG_LEVEL=debug` to watch summary lines (“processed labels/caption via ollama”).
- Use `log.Trace` sparingly; it prints truncated JSON blobs for troubleshooting.
- **Metrics**
- `/api/v1/metrics` exposes counts per label source; scrape after a batch to compare throughput with TensorFlow/OpenAI runs.
### Code Map
- `internal/ai/vision/ollama/*.go` — Engine defaults, schema helpers, transport structs.
- `internal/ai/vision/engine_ollama.go` — Builder/parser glue plus label/caption normalization.
- `internal/ai/vision/api_ollama.go` — Base64 payload builder.
- `internal/ai/vision/api_client.go` — Streaming decoder shared among engines.
- `internal/ai/vision/models.go` — Default caption model definition (`gemma3`).
- `compose*.yaml` — Ollama service profile, Traefik labels, and persistent volume wiring.
- `frontend/src/common/util.js` — Maps `src="ollama"` to the correct badge; keep it updated when adding new source strings.
### Next Steps
- [ ] Add formal schema validation (JSON Schema or JTD) so malformed label responses fail fast before normalization.
- [ ] Support multiple thumbnails per request once core workflows confirm the API contract (requires worker + UI changes).
- [ ] Emit per-model latency and success metrics from the vision worker to simplify tuning when several Ollama engines run side-by-side.
- [ ] Mirror any loader changes into PhotoPrism Plus/Pro templates to keep splash + browser checks consistent after enabling external engines.

View file

@ -1,7 +1,5 @@
package ollama
import "github.com/photoprism/photoprism/internal/ai/vision/schema"
const (
// CaptionPrompt instructs Ollama caption models to emit a single, active-voice sentence.
CaptionPrompt = "Create a caption with exactly one sentence in the active voice that describes the main visual content. Begin with the main subject and clear action. Avoid text formatting, meta-language, and filler words."
@ -22,12 +20,3 @@ const (
// DefaultResolution is the default thumbnail size submitted to Ollama models.
DefaultResolution = 720
)
// LabelsSchema returns the canonical label schema string consumed by Ollama models.
func LabelsSchema(nsfw bool) string {
if nsfw {
return schema.LabelsNSFW
} else {
return schema.LabelsDefault
}
}

View file

@ -0,0 +1,14 @@
package ollama
import (
"github.com/photoprism/photoprism/internal/ai/vision/schema"
)
// SchemaLabels returns the canonical label schema string consumed by Ollama models.
//
// Related documentation and references:
// - https://www.alibabacloud.com/help/en/model-studio/json-mode
// - https://www.json.org/json-en.html
func SchemaLabels(nsfw bool) string {
return schema.LabelsJson(nsfw)
}

View file

@ -0,0 +1,80 @@
package ollama
import (
"errors"
"fmt"
"time"
)
// Response encapsulates the subset of the Ollama generate API response we care about.
type Response struct {
ID string `yaml:"Id,omitempty" json:"id,omitempty"`
Code int `yaml:"Code,omitempty" json:"code,omitempty"`
Error string `yaml:"Error,omitempty" json:"error,omitempty"`
Model string `yaml:"Model,omitempty" json:"model,omitempty"`
CreatedAt time.Time `yaml:"CreatedAt,omitempty" json:"created_at,omitempty"`
Response string `yaml:"Response,omitempty" json:"response,omitempty"`
Thinking string `yaml:"Thinking,omitempty" json:"thinking,omitempty"`
Done bool `yaml:"Done,omitempty" json:"done,omitempty"`
Context []int `yaml:"Context,omitempty" json:"context,omitempty"`
TotalDuration int64 `yaml:"TotalDuration,omitempty" json:"total_duration,omitempty"`
LoadDuration int `yaml:"LoadDuration,omitempty" json:"load_duration,omitempty"`
PromptEvalCount int `yaml:"PromptEvalCount,omitempty" json:"prompt_eval_count,omitempty"`
PromptEvalDuration int `yaml:"PromptEvalDuration,omitempty" json:"prompt_eval_duration,omitempty"`
EvalCount int `yaml:"EvalCount,omitempty" json:"eval_count,omitempty"`
EvalDuration int64 `yaml:"EvalDuration,omitempty" json:"eval_duration,omitempty"`
Result ResultPayload `yaml:"Result,omitempty" json:"result,omitempty"`
}
// Err returns an error if the request has failed.
func (r *Response) Err() error {
if r == nil {
return errors.New("response is nil")
}
if r.Code >= 400 {
if r.Error != "" {
return errors.New(r.Error)
}
return fmt.Errorf("error %d", r.Code)
} else if len(r.Result.Labels) == 0 && r.Result.Caption == nil {
return errors.New("no result")
}
return nil
}
// HasResult checks if there is at least one result in the response data.
func (r *Response) HasResult() bool {
if r == nil {
return false
}
return len(r.Result.Labels) > 0 || r.Result.Caption != nil
}
// ResultPayload mirrors the structure returned by Ollama for result data.
type ResultPayload struct {
Labels []LabelPayload `json:"labels"`
Caption *CaptionPayload `json:"caption,omitempty"`
}
// LabelPayload represents a single label object emitted by the Ollama adapter.
type LabelPayload struct {
Name string `json:"name"`
Source string `json:"source,omitempty"`
Priority int `json:"priority,omitempty"`
Confidence float32 `json:"confidence,omitempty"`
Topicality float32 `json:"topicality,omitempty"`
Categories []string `json:"categories,omitempty"`
NSFW bool `json:"nsfw,omitempty"`
NSFWConfidence float32 `json:"nsfw_confidence,omitempty"`
}
// CaptionPayload represents the caption object emitted by the Ollama adapter.
type CaptionPayload struct {
Text string `json:"text"`
Source string `json:"source,omitempty"`
Confidence float32 `json:"confidence,omitempty"`
}

View file

@ -0,0 +1,90 @@
package ollama
import (
"testing"
"time"
)
func TestResponseErr(t *testing.T) {
t.Run("NilResponse", func(t *testing.T) {
if err := (*Response)(nil).Err(); err == nil || err.Error() != "response is nil" {
t.Fatalf("expected nil-response error, got %v", err)
}
})
t.Run("HTTPErrorWithMessage", func(t *testing.T) {
resp := &Response{Code: 429, Error: "too many requests"}
if err := resp.Err(); err == nil || err.Error() != "too many requests" {
t.Fatalf("expected message error, got %v", err)
}
})
t.Run("HTTPErrorWithoutMessage", func(t *testing.T) {
resp := &Response{Code: 500}
if err := resp.Err(); err == nil || err.Error() != "error 500" {
t.Fatalf("expected formatted error, got %v", err)
}
})
t.Run("NoResult", func(t *testing.T) {
resp := &Response{Code: 200}
if err := resp.Err(); err == nil || err.Error() != "no result" {
t.Fatalf("expected no-result error, got %v", err)
}
})
t.Run("HasLabels", func(t *testing.T) {
resp := &Response{
Code: 200,
Result: ResultPayload{Labels: []LabelPayload{{Name: "sky"}}},
Model: "qwen",
}
if err := resp.Err(); err != nil {
t.Fatalf("unexpected error: %v", err)
}
})
t.Run("HasCaption", func(t *testing.T) {
resp := &Response{
Code: 200,
Result: ResultPayload{Caption: &CaptionPayload{Text: "Caption"}},
}
if err := resp.Err(); err != nil {
t.Fatalf("unexpected error: %v", err)
}
})
}
func TestResponseHasResult(t *testing.T) {
if (*Response)(nil).HasResult() {
t.Fatal("nil response should not have result")
}
resp := &Response{}
if resp.HasResult() {
t.Fatal("expected false when result payload is empty")
}
resp.Result.Labels = []LabelPayload{{Name: "sun"}}
if !resp.HasResult() {
t.Fatal("expected true when labels present")
}
resp.Result.Labels = nil
resp.Result.Caption = &CaptionPayload{Text: "Sky", Confidence: 0.9}
if !resp.HasResult() {
t.Fatal("expected true when caption present")
}
}
func TestResponseJSONTagsAreOptional(t *testing.T) {
// Guard against accidental breaking changes to essential fields
resp := Response{
ID: "test",
Model: "ollama",
CreatedAt: time.Now(),
}
if resp.ID == "" || resp.Model == "" {
t.Fatalf("response fields should persist, got %+v", resp)
}
}

View file

@ -0,0 +1,128 @@
## PhotoPrism — OpenAI API Integration
**Last Updated:** November 14, 2025
### Overview
This package contains PhotoPrism's adapter for the OpenAI Responses API. It enables existing caption and label workflows (`GenerateCaption`, `GenerateLabels`, and the `photoprism vision run` CLI) to call OpenAI models alongside TensorFlow and Ollama without changing worker or API code. The implementation focuses on predictable results, structured outputs, and clear observability so operators can opt in gradually.
#### Context & Constraints
- OpenAI requests flow through the existing vision client (`internal/ai/vision/api_client.go`) and must honour PhotoPrism's timeout, logging, and ACL rules.
- Structured outputs are preferred but the adapter must gracefully handle free-form text; `output_text` responses are parsed both as JSON and as plain captions.
- Costs should remain predictable: requests are limited to a single 720px thumbnail (`detail=low`) and capped token budgets (512 caption, 1024 labels).
- Secrets are supplied per model (`Service.Key`) with fallbacks to `OPENAI_API_KEY` / `_FILE`. Logs must redact sensitive data.
#### Goals
- Provide drop-in OpenAI support for captions and labels using `vision.yml`.
- Keep configuration ergonomic by auto-populating prompts, schema names, token limits, and sampling defaults.
- Expose enough logging and tests so operators can compare OpenAI output with existing engines before enabling it broadly.
#### Non-Goals
- Introducing a new `generate` model type or combined caption/label endpoint (reserved for a later phase).
- Replacing the default TensorFlow models; they remain active as fallbacks.
- Managing OpenAI billing or quota dashboards beyond surfacing token counts in logs and metrics.
### Prompt, Model, & Schema Guidance
- **Models:** The adapter targets GPT-5 vision tiers (e.g. `gpt-5-nano`, `gpt-5-mini`). These models support image inputs, structured outputs, and deterministic settings. Set `Name` to the exact provider identifier so defaults are applied correctly. Caption models share the same configuration surface and run through the same adapter.
- **Prompts:** Defaults live in `defaults.go`. Captions use a single-sentence instruction; labels use `LabelPromptDefault` (or `LabelPromptNSFW` when PhotoPrism requests NSFW metadata). Custom prompts should retain schema reminders so structured outputs stay valid.
- **Schemas:** Labels use the JSON schema returned by `schema.LabelsJsonSchema(nsfw)`; the response format name is derived via `schema.JsonSchemaName` (e.g. `photoprism_vision_labels_v1`). Captions omit schemas unless operators explicitly request a structured format.
- **When to keep defaults:** For most deployments, leaving `System`, `Prompt`, `Schema`, and `Options` unset yields stable output with minimal configuration. Override them only when domain-specific language or custom scoring is necessary, and add regression tests alongside.
Budget-conscious operators can experiment with lighter prompts or lower-resolution thumbnails, but should keep token limits and determinism settings intact to avoid unexpected bills and UI churn.
#### Performance & Cost Estimates
- **Token budgets:** Captions request up to 512 output tokens; labels request up to 1024. Input tokens are typically ≤700 for a single 720px thumbnail plus prompts.
- **Latency:** GPT-5 nano/mini vision calls typically complete in 3–8s, depending on OpenAI region. Including reasoning metadata (`reasoning.effort=low`) has negligible impact but improves traceability.
- **Costs:** Consult OpenAI's pricing for the selected model. Multiply input/output tokens by the published rate. PhotoPrism currently sends one image per request to keep costs linear with photo count.
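As a back-of-the-envelope example, the labels fixture below reports 724 input and 169 output tokens. A sketch of the cost arithmetic, with placeholder per-token rates (substitute the published pricing for your model):

```go
package cost // illustrative only

// requestCostUSD multiplies token counts by per-million-token rates. The
// rates below are hypothetical placeholders, not OpenAI's actual pricing.
func requestCostUSD(inputTokens, outputTokens int) float64 {
	const (
		inputPerMillion  = 0.25 // hypothetical USD per 1M input tokens
		outputPerMillion = 2.00 // hypothetical USD per 1M output tokens
	)
	return float64(inputTokens)/1e6*inputPerMillion +
		float64(outputTokens)/1e6*outputPerMillion
}

// requestCostUSD(724, 169) ≈ $0.00052 per picture at these example rates.
```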
### Configuration
#### Environment Variables
- `OPENAI_API_KEY` / `OPENAI_API_KEY_FILE` — fallback credentials when a model's `Service.Key` is unset.
- Existing `PHOTOPRISM_VISION_*` variables remain authoritative (see the [Getting Started Guide](https://docs.photoprism.app/getting-started/config-options/#computer-vision) for full lists).
#### `vision.yml` Examples
```yaml
Models:
- Type: caption
Name: gpt-5-nano
Engine: openai
Disabled: false # opt in manually
Resolution: 720 # optional; default is 720
Options:
Detail: low # optional; defaults to low
MaxOutputTokens: 512
Service:
Uri: https://api.openai.com/v1/responses
FileScheme: data
Key: ${OPENAI_API_KEY}
- Type: labels
Name: gpt-5-mini
Engine: openai
Disabled: false
Resolution: 720
Options:
Detail: low
MaxOutputTokens: 1024
ForceJson: true # redundant but explicit
Service:
Uri: https://api.openai.com/v1/responses
FileScheme: data
Key: ${OPENAI_API_KEY}
```
Keep TensorFlow entries in place so PhotoPrism falls back when the external service is unavailable.
#### Defaults
- File scheme: `data:` URLs (base64) for all OpenAI models.
- Resolution: 720px thumbnails (`vision.Thumb(ModelTypeCaption|Labels)`).
- Options: `MaxOutputTokens` raised to 512 (caption) / 1024 (labels); `ForceJson=false` for captions, `true` for labels; `reasoning.effort="low"`.
- Sampling: `Temperature` and `TopP` set to `0` for `gpt-5*` models; inherited values (0.1/0.9) remain for other engines. `openaiBuilder.Build` performs this override while preserving the struct defaults for non-OpenAI adapters.
- Schema naming: Automatically derived via `schema.JsonSchemaName`, so operators may omit `SchemaVersion`.
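A hedged sketch of the sampling override described above; the exact check lives in `openaiBuilder.Build`, and the name-prefix test is an assumption for illustration:

```go
package sampling // illustrative only

import "strings"

// applyDeterministicSampling zeroes Temperature and TopP for GPT-5 model
// names while leaving other engines on their inherited defaults (0.1/0.9).
func applyDeterministicSampling(modelName string, temperature, topP *float64) {
	if strings.HasPrefix(modelName, "gpt-5") {
		*temperature = 0
		*topP = 0
	}
}
```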
### Documentation
- Label Generation: <https://docs.photoprism.app/developer-guide/vision/label-generation/>
- Caption Generation: <https://docs.photoprism.app/developer-guide/vision/caption-generation/>
- Vision CLI Commands: <https://docs.photoprism.app/developer-guide/vision/cli/>
### Implementation Details
#### Core Concepts
- **Structured outputs:** PhotoPrism leverages OpenAI's structured output capability as documented at <https://platform.openai.com/docs/guides/structured-outputs>. When a JSON schema is supplied, the adapter emits `text.format` with `type: "json_schema"` and a schema name derived from the content. The parser then prefers `output_json`, but also attempts to decode `output_text` payloads that contain JSON objects.
- **Deterministic sampling:** GPT-5 models are run with `temperature=0` and `top_p=0` to minimise variance, while still allowing developers to override values in `vision.yml` if needed.
- **Reasoning metadata:** Requests include `reasoning.effort="low"` so OpenAI returns structured reasoning usage counters, helping operators track token consumption.
- **Worker summaries:** The vision worker now logs either “updated …” or “processed … (no metadata changes detected)”, making reruns easy to audit.
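The parser preference described above can be sketched with the `Response` helpers defined in this package (`FirstJSON`, `FirstText`); the wrapper below is illustrative, not the adapter's actual code path:

```go
package example // illustrative wrapper, not part of the adapter

import (
	"encoding/json"
	"strings"

	"github.com/photoprism/photoprism/internal/ai/vision/openai"
)

// labelsPayload prefers a structured output_json part and falls back to
// output_text that contains a JSON object, mirroring the order above.
func labelsPayload(resp *openai.Response) (json.RawMessage, bool) {
	if raw := resp.FirstJSON(); len(raw) > 0 {
		return raw, true
	}
	if text := strings.TrimSpace(resp.FirstText()); strings.HasPrefix(text, "{") {
		return json.RawMessage(text), true
	}
	return nil, false
}
```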
#### Rate Limiting
OpenAI calls respect the existing `limiter.Auth` configuration used by the vision service. Failed requests surface standard HTTP errors and are not automatically retried; operators should ensure they have adequate account limits and consider external rate limiting when sharing credentials.
#### Testing & Validation
1. Unit tests: `go test ./internal/ai/vision/openai ./internal/ai/vision -run OpenAI -count=1`. Fixtures under `internal/ai/vision/openai/testdata/` replay real Responses payloads (captions and labels).
2. CLI smoke test: `photoprism vision run -m labels --count 1 --force` with trace logging enabled to inspect sanitised Responses.
3. Compare worker summaries and label sources (`openai`) in the UI or via `photoprism vision ls`.
#### Code Map
- **Adapter & defaults:** `internal/ai/vision/openai` (defaults, schema helpers, transport, tests).
- **Request/response plumbing:** `internal/ai/vision/api_request.go`, `api_client.go`, `engine_openai.go`, `engine_openai_test.go`.
- **Workers & CLI:** `internal/workers/vision.go`, `internal/commands/vision_run.go`.
- **Shared utilities:** `internal/ai/vision/schema`, `pkg/clean`, `pkg/media`.
#### Next Steps
- [ ] Introduce the future `generate` model type that combines captions, labels, and optional markers.
- [ ] Evaluate additional OpenAI models as pricing and capabilities evolve.
- [ ] Expose token usage metrics (input/output/reasoning) via Prometheus once the schema stabilises.

View file

@ -1,6 +1,29 @@
package openai
import "github.com/photoprism/photoprism/internal/ai/vision/schema"
const (
// CaptionSystem defines the default system prompt for caption models.
CaptionSystem = "You are a PhotoPrism vision model. Return concise, user-friendly captions that describe the main subjects accurately."
// CaptionPrompt instructs caption models to respond with a single sentence.
CaptionPrompt = "Provide exactly one sentence describing the key subject and action in the image. Avoid filler words and technical jargon."
// LabelSystem defines the system prompt for label generation.
LabelSystem = "You are a PhotoPrism vision model. Emit JSON that matches the provided schema and keep label names short, singular nouns."
// LabelPromptDefault requests general-purpose labels.
LabelPromptDefault = "Analyze the image and return label objects with name, confidence (0-1), and topicality (0-1)."
// LabelPromptNSFW requests labels including NSFW metadata when required.
LabelPromptNSFW = "Analyze the image and return label objects with name, confidence (0-1), topicality (0-1), nsfw (true when sensitive), and nsfw_confidence (0-1)."
// DefaultDetail specifies the preferred thumbnail detail level for Responses API calls.
DefaultDetail = "low"
// CaptionMaxTokens suggests the output budget for caption responses.
CaptionMaxTokens = 512
// LabelsMaxTokens suggests the output budget for label responses.
LabelsMaxTokens = 1024
// DefaultTemperature configures deterministic replies.
DefaultTemperature = 0.1
// DefaultTopP limits nucleus sampling.
DefaultTopP = 0.9
// DefaultSchemaVersion is used when callers do not specify an explicit schema version.
DefaultSchemaVersion = "v1"
)
var (
// DefaultModel is the model used by default when accessing the OpenAI API.
@ -8,8 +31,3 @@ var (
// DefaultResolution is the default thumbnail size submitted to the OpenAI API.
DefaultResolution = 720
)
// LabelsSchema returns the canonical label schema string consumed by OpenAI models.
func LabelsSchema() string {
return schema.LabelsDefault
}

View file

@ -0,0 +1,16 @@
package openai
import (
"encoding/json"
"github.com/photoprism/photoprism/internal/ai/vision/schema"
)
// SchemaLabels returns the canonical labels JSON Schema string consumed by OpenAI models.
//
// Related documentation and references:
// - https://platform.openai.com/docs/guides/structured-outputs
// - https://json-schema.org/learn/miscellaneous-examples
func SchemaLabels(nsfw bool) json.RawMessage {
return schema.LabelsJsonSchema(nsfw)
}

View file

@ -0,0 +1,73 @@
{
"id": "resp_0d356718505119f3006916e5d8730881a0b91de2aa700f6196",
"object": "response",
"created_at": 1763108312,
"status": "completed",
"background": false,
"billing": {
"payer": "developer"
},
"error": null,
"incomplete_details": null,
"instructions": null,
"max_output_tokens": 512,
"max_tool_calls": null,
"model": "gpt-5-nano-2025-08-07",
"output": [
{
"id": "rs_0d356718505119f3006916e5d8efd481a0a4f9cc1823cc6c83",
"type": "reasoning",
"summary": []
},
{
"id": "msg_0d356718505119f3006916e5d9433881a0bc79197d2cfc2027",
"type": "message",
"status": "completed",
"content": [
{
"type": "output_text",
"annotations": [],
"logprobs": [],
"text": "A bee gathers nectar from the vibrant red poppy\u2019s center."
}
],
"role": "assistant"
}
],
"parallel_tool_calls": true,
"previous_response_id": null,
"prompt_cache_key": null,
"prompt_cache_retention": null,
"reasoning": {
"effort": "low",
"summary": null
},
"safety_identifier": null,
"service_tier": "default",
"store": true,
"temperature": 1.0,
"text": {
"format": {
"type": "text"
},
"verbosity": "medium"
},
"tool_choice": "auto",
"tools": [],
"top_logprobs": 0,
"top_p": 1.0,
"truncation": "disabled",
"usage": {
"input_tokens": 576,
"input_tokens_details": {
"cached_tokens": 0
},
"output_tokens": 19,
"output_tokens_details": {
"reasoning_tokens": 0
},
"total_tokens": 595
},
"user": null,
"metadata": {}
}

View file

@ -0,0 +1,114 @@
{
"id": "resp_0fa91dfb69b7d644006916ea0b72ac819f84ff3152a38dfcdb",
"object": "response",
"created_at": 1763109387,
"status": "completed",
"background": false,
"billing": {
"payer": "developer"
},
"error": null,
"incomplete_details": null,
"instructions": null,
"max_output_tokens": 1024,
"max_tool_calls": null,
"model": "gpt-5-mini-2025-08-07",
"output": [
{
"id": "rs_0fa91dfb69b7d644006916ea0c3450819f8a13396bf377f474",
"type": "reasoning",
"summary": []
},
{
"id": "msg_0fa91dfb69b7d644006916ea0d2dfc819faf52b11334fc10a4",
"type": "message",
"status": "completed",
"content": [
{
"type": "output_text",
"annotations": [],
"logprobs": [],
"text": "{\"labels\":[{\"name\":\"flower\",\"confidence\":0.99,\"topicality\":0.99},{\"name\":\"bee\",\"confidence\":0.95,\"topicality\":0.95},{\"name\":\"petal\",\"confidence\":0.92,\"topicality\":0.88},{\"name\":\"pollen\",\"confidence\":0.85,\"topicality\":0.8},{\"name\":\"insect\",\"confidence\":0.9,\"topicality\":0.85},{\"name\":\"red\",\"confidence\":0.88,\"topicality\":0.6},{\"name\":\"close-up\",\"confidence\":0.86,\"topicality\":0.7},{\"name\":\"nature\",\"confidence\":0.8,\"topicality\":0.5}]}"
}
],
"role": "assistant"
}
],
"parallel_tool_calls": true,
"previous_response_id": null,
"prompt_cache_key": null,
"prompt_cache_retention": null,
"reasoning": {
"effort": "low",
"summary": null
},
"safety_identifier": null,
"service_tier": "default",
"store": true,
"temperature": 1.0,
"text": {
"format": {
"type": "json_schema",
"description": null,
"name": "photoprism_vision_labels_v1",
"schema": {
"type": "object",
"properties": {
"labels": {
"type": "array",
"items": {
"type": "object",
"properties": {
"name": {
"type": "string",
"minLength": 1
},
"confidence": {
"type": "number",
"minimum": 0,
"maximum": 1
},
"topicality": {
"type": "number",
"minimum": 0,
"maximum": 1
}
},
"required": [
"name",
"confidence",
"topicality"
],
"additionalProperties": false
},
"default": []
}
},
"required": [
"labels"
],
"additionalProperties": false
},
"strict": true
},
"verbosity": "medium"
},
"tool_choice": "auto",
"tools": [],
"top_logprobs": 0,
"top_p": 1.0,
"truncation": "disabled",
"usage": {
"input_tokens": 724,
"input_tokens_details": {
"cached_tokens": 0
},
"output_tokens": 169,
"output_tokens_details": {
"reasoning_tokens": 0
},
"total_tokens": 893
},
"user": null,
"metadata": {}
}

View file

@ -0,0 +1,142 @@
package openai
import (
"encoding/json"
"strings"
)
const (
// ContentTypeText identifies text input segments for the Responses API.
ContentTypeText = "input_text"
// ContentTypeImage identifies image input segments for the Responses API.
ContentTypeImage = "input_image"
// ResponseFormatJSONSchema requests JSON constrained by a schema.
ResponseFormatJSONSchema = "json_schema"
// ResponseFormatJSONObject requests a free-form JSON object.
ResponseFormatJSONObject = "json_object"
)
// HTTPRequest represents the payload expected by OpenAI's Responses API.
type HTTPRequest struct {
Model string `json:"model"`
Input []InputMessage `json:"input"`
Text *TextOptions `json:"text,omitempty"`
Reasoning *Reasoning `json:"reasoning,omitempty"`
MaxOutputTokens int `json:"max_output_tokens,omitempty"`
Temperature float64 `json:"temperature,omitempty"`
TopP float64 `json:"top_p,omitempty"`
PresencePenalty float64 `json:"presence_penalty,omitempty"`
FrequencyPenalty float64 `json:"frequency_penalty,omitempty"`
}
// TextOptions carries formatting preferences for textual responses.
type TextOptions struct {
Format *ResponseFormat `json:"format,omitempty"`
}
// Reasoning configures the effort level for reasoning models.
type Reasoning struct {
Effort string `json:"effort,omitempty"`
}
// InputMessage captures a single system or user message in the request.
type InputMessage struct {
Role string `json:"role"`
Type string `json:"type,omitempty"`
Content []ContentItem `json:"content"`
}
// ContentItem represents a text or image entry within a message.
type ContentItem struct {
Type string `json:"type"`
Text string `json:"text,omitempty"`
ImageURL string `json:"image_url,omitempty"`
Detail string `json:"detail,omitempty"`
}
// ResponseFormat describes how OpenAI should format its response.
type ResponseFormat struct {
Type string `json:"type"`
Name string `json:"name,omitempty"`
Schema json.RawMessage `json:"schema,omitempty"`
Description string `json:"description,omitempty"`
Strict bool `json:"strict,omitempty"`
}
// Response mirrors the subset of the Responses API response we need.
type Response struct {
ID string `json:"id"`
Model string `json:"model"`
Output []ResponseOutput `json:"output"`
Error *struct {
Message string `json:"message"`
Type string `json:"type"`
} `json:"error,omitempty"`
}
// ResponseOutput captures assistant messages within the response.
type ResponseOutput struct {
Role string `json:"role"`
Content []ResponseContent `json:"content"`
}
// ResponseContent contains individual message parts (JSON or text).
type ResponseContent struct {
Type string `json:"type"`
Text string `json:"text,omitempty"`
JSON json.RawMessage `json:"json,omitempty"`
}
// FirstJSON returns the first JSON payload contained in the response.
func (r *Response) FirstJSON() json.RawMessage {
if r == nil {
return nil
}
for i := range r.Output {
for j := range r.Output[i].Content {
if len(r.Output[i].Content[j].JSON) > 0 {
return r.Output[i].Content[j].JSON
}
}
}
return nil
}
// FirstText returns the first textual payload contained in the response.
func (r *Response) FirstText() string {
if r == nil {
return ""
}
for i := range r.Output {
for j := range r.Output[i].Content {
if text := strings.TrimSpace(r.Output[i].Content[j].Text); text != "" {
return text
}
}
}
return ""
}
// ParseErrorMessage extracts a human readable error message from a Responses API payload.
func ParseErrorMessage(raw []byte) string {
var errResp struct {
Error *struct {
Message string `json:"message"`
} `json:"error"`
}
if err := json.Unmarshal(raw, &errResp); err != nil {
return ""
}
if errResp.Error != nil {
return strings.TrimSpace(errResp.Error.Message)
}
return ""
}

View file

@ -0,0 +1,120 @@
package openai
import (
"encoding/json"
"os"
"path/filepath"
"testing"
)
func loadTestResponse(t *testing.T, name string) *Response {
t.Helper()
filePath := filepath.Join("testdata", name)
data, err := os.ReadFile(filePath)
if err != nil {
t.Fatalf("failed to read %s: %v", filePath, err)
}
var resp Response
if err := json.Unmarshal(data, &resp); err != nil {
t.Fatalf("failed to unmarshal %s: %v", filePath, err)
}
return &resp
}
func TestParseErrorMessage(t *testing.T) {
t.Run("returns message when present", func(t *testing.T) {
raw := []byte(`{"error":{"message":"Invalid schema"}}`)
msg := ParseErrorMessage(raw)
if msg != "Invalid schema" {
t.Fatalf("expected message, got %q", msg)
}
})
t.Run("returns empty string when error is missing", func(t *testing.T) {
raw := []byte(`{"output":[]}`)
if msg := ParseErrorMessage(raw); msg != "" {
t.Fatalf("expected empty message, got %q", msg)
}
})
}
func TestResponseFirstTextCaption(t *testing.T) {
resp := loadTestResponse(t, "caption-response.json")
if jsonPayload := resp.FirstJSON(); len(jsonPayload) != 0 {
t.Fatalf("expected no JSON payload, got: %s", jsonPayload)
}
text := resp.FirstText()
expected := "A bee gathers nectar from the vibrant red poppys center."
if text != expected {
t.Fatalf("unexpected caption text: %q", text)
}
}
func TestResponseFirstTextLabels(t *testing.T) {
resp := loadTestResponse(t, "labels-response.json")
if jsonPayload := resp.FirstJSON(); len(jsonPayload) != 0 {
t.Fatalf("expected no JSON payload, got: %s", jsonPayload)
}
text := resp.FirstText()
if len(text) == 0 {
t.Fatal("expected structured JSON string in text payload")
}
if text[0] != '{' {
t.Fatalf("expected JSON object in text payload, got %q", text)
}
}
func TestResponseFirstJSONFromStructuredPayload(t *testing.T) {
resp := &Response{
ID: "resp_structured",
Model: "gpt-5-mini",
Output: []ResponseOutput{
{
Role: "assistant",
Content: []ResponseContent{
{
Type: "output_json",
JSON: json.RawMessage(`{"labels":[{"name":"sunset"}]}`),
},
},
},
},
}
jsonPayload := resp.FirstJSON()
if len(jsonPayload) == 0 {
t.Fatal("expected JSON payload, got empty result")
}
var decoded struct {
Labels []map[string]string `json:"labels"`
}
if err := json.Unmarshal(jsonPayload, &decoded); err != nil {
t.Fatalf("failed to decode JSON payload: %v", err)
}
if len(decoded.Labels) != 1 || decoded.Labels[0]["name"] != "sunset" {
t.Fatalf("unexpected JSON payload: %+v", decoded.Labels)
}
}
func TestSchemaLabelsReturnsValidJSON(t *testing.T) {
raw := SchemaLabels(false)
var decoded map[string]any
if err := json.Unmarshal(raw, &decoded); err != nil {
t.Fatalf("schema should be valid JSON: %v", err)
}
if decoded["type"] != "object" {
t.Fatalf("expected type object, got %v", decoded["type"])
}
}

View file

@ -0,0 +1,52 @@
## PhotoPrism — Vision Schema Reference
**Last Updated:** November 14, 2025
### Overview
This package contains the canonical label response specifications used by PhotoPrism's external vision engines. It exposes two helpers:
- `LabelsJsonSchema(nsfw bool)` — returns a JSON **Schema** document tailored for OpenAI Responses requests, enabling strict validation of structured outputs.
- `LabelsJson(nsfw bool)` — returns a literal JSON **sample** that Ollama-style models can mirror when they only support prompt-enforced structures.
Both helpers build on the same field set (`name`, `confidence`, `topicality`, and optional NSFW flags) so downstream parsing logic (`LabelResult`) can remain engine-agnostic.
### Schema Types & Differences
| Helper | Target Engine | Format | Validation Style | When To Use |
|---------------------------|--------------------------|--------------------------------------------------------|-------------------------------------------------------------------------------------|-----------------------------------------------------------------------------------------------------------------|
| `LabelsJsonSchema(false)` | OpenAI (standard labels) | JSON Schema Draft | Strong: OpenAI enforces field types/ranges server-side before returning a response. | When calling GPT-5 vision models via `ApiFormatOpenAI` to ensure PhotoPrism receives well-formed label arrays. |
| `LabelsJsonSchema(true)` | OpenAI (labels + NSFW) | JSON Schema Draft with additional boolean/float fields | Strong: same enforcement plus required NSFW fields. | When `DetectNSFWLabels` or NSFW-specific prompts are active and the model must emit `nsfw` + `nsfw_confidence`. |
| `LabelsJson(false)` | Ollama (standard labels) | Plain JSON example | Soft: model is nudged to mimic the structure through prompt instructions. | When running self-hosted Ollama models that support “JSON mode” but do not consume JSON Schema definitions. |
| `LabelsJson(true)` | Ollama (labels + NSFW) | Plain JSON example with NSFW keys | Soft: prompts describe the required keys; the adapter validates after parsing. | When Ollama prompts mention NSFW scoring or PhotoPrism sets `DetectNSFWLabels=true`. |
**Key technical distinction:** OpenAI's Responses API accepts a JSON Schema (see `LabelsJsonSchema*`) and guarantees compliance by rejecting invalid responses, while Ollama currently relies on prompt-directed output. For Ollama integrations we provide a representative JSON document (`LabelsJson*`) that models can imitate; PhotoPrism then normalizes and validates the results in Go.
### Field Definitions
- `name` — single-word noun describing the subject (string, required).
- `confidence` — normalized score between `0` and `1` (float, required).
- `topicality` — relative relevance score between `0` and `1` (float, required; defaults to `confidence` if omitted after parsing).
- `nsfw` — boolean flag indicating sensitive content (required only in NSFW variants).
- `nsfw_confidence` — normalized probability for the NSFW assessment (required only in NSFW variants).
OpenAI schemas enforce these ranges/types, while Ollama prompts remind the model to emit matching keys. After parsing, PhotoPrism applies `LabelConfidenceDefault` and `normalizeLabelResult` to fill gaps and enforce naming rules.
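A minimal sketch of that gap-filling, assuming the defaults named above (`LabelConfidenceDefault = 0.5`, topicality falling back to confidence); the authoritative logic is `normalizeLabelResult`:

```go
package normalize // illustrative only

// fillLabelDefaults applies the fallbacks described above: a missing
// confidence gets the adapter default (0.5), and a missing topicality
// defaults to the confidence value.
func fillLabelDefaults(confidence, topicality float32) (float32, float32) {
	const labelConfidenceDefault = 0.5 // assumed to match LabelConfidenceDefault
	if confidence <= 0 {
		confidence = labelConfidenceDefault
	}
	if topicality <= 0 {
		topicality = confidence
	}
	return confidence, topicality
}
```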
### Usage Guidance
1. **OpenAI models** (`Engine: openai`, `RequestFormat: openai`):
- Leave `Schema` unset in `vision.yml`; the engine defaults call `LabelsJsonSchema(model.PromptContains("nsfw"))`.
- Optionally override the schema via `Schema`/`SchemaFile` if you extend fields, but keep required keys so `LabelResult` parsing succeeds.
2. **Ollama models** (`Engine: ollama`, `RequestFormat: ollama`):
- Rely on the built-in samples from `LabelsJson` or include them directly in prompts via `model.SchemaInstructions()`.
- Because enforcement happens after the response arrives, keep `Format: json` (default) and `Options.ForceJson=true` for label models to make parsing stricter.
3. **Custom engines**:
- Reuse these helpers to stay compatible with PhotoPrism's label DTOs.
- When adding new fields, update both schema/sample versions so OpenAI and Ollama adapters remain aligned.
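For custom engines, here is a sketch of pairing the canonical schema with its derived response-format name via the helpers in this package; the `v1` version string and the return shape are assumptions for illustration:

```go
package example // illustrative only

import (
	"github.com/photoprism/photoprism/internal/ai/vision/schema"
)

// labelResponseFormat returns the labels JSON Schema together with its
// derived name, e.g. "photoprism_vision_labels_v1".
func labelResponseFormat(nsfw bool) (name string, raw []byte) {
	s := schema.LabelsJsonSchema(nsfw)
	return schema.JsonSchemaName(s, "v1"), s
}
```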
### References
- JSON Schema primer: https://json-schema.org/learn/miscellaneous-examples
- OpenAI structured outputs: https://platform.openai.com/docs/guides/structured-outputs
- JSON mode background (Ollama-style prompts): https://www.alibabacloud.com/help/en/model-studio/json-mode
- JSON syntax refresher: https://www.json.org/json-en.html

View file

@ -1,16 +1,115 @@
package schema
// LabelsDefault provides the minimal JSON schema for label responses used across engines.
const (
LabelsDefault = "{\n \"labels\": [{\n \"name\": \"\",\n \"confidence\": 0,\n \"topicality\": 0 }]\n}"
LabelsNSFW = "{\n \"labels\": [{\n \"name\": \"\",\n \"confidence\": 0,\n \"topicality\": 0,\n \"nsfw\": false,\n \"nsfw_confidence\": 0\n }]\n}"
import (
"encoding/json"
)
// Labels returns the canonical label schema string.
func Labels(nsfw bool) string {
// LabelsJsonSchemaDefault provides the minimal JSON schema for label responses used across engines.
const (
LabelsJsonSchemaDefault = `{
"type": "object",
"properties": {
"labels": {
"type": "array",
"items": {
"type": "object",
"properties": {
"name": {
"type": "string",
"minLength": 1
},
"confidence": {
"type": "number",
"minimum": 0,
"maximum": 1
},
"topicality": {
"type": "number",
"minimum": 0,
"maximum": 1
}
},
"required": ["name", "confidence", "topicality"],
"additionalProperties": false
},
"default": []
}
},
"required": ["labels"],
"additionalProperties": false
}`
LabelsJsonDefault = "{\n \"labels\": [{\n \"name\": \"\",\n \"confidence\": 0,\n \"topicality\": 0 }]\n}"
LabelsJsonSchemaNSFW = `{
"type": "object",
"properties": {
"labels": {
"type": "array",
"items": {
"type": "object",
"properties": {
"name": {
"type": "string",
"minLength": 1
},
"confidence": {
"type": "number",
"minimum": 0,
"maximum": 1
},
"topicality": {
"type": "number",
"minimum": 0,
"maximum": 1
},
"nsfw": {
"type": "boolean"
},
"nsfw_confidence": {
"type": "number",
"minimum": 0,
"maximum": 1
}
},
"required": [
"name",
"confidence",
"topicality",
"nsfw",
"nsfw_confidence"
],
"additionalProperties": false
},
"default": []
}
},
"required": ["labels"],
"additionalProperties": false
}`
LabelsJsonNSFW = "{\n \"labels\": [{\n \"name\": \"\",\n \"confidence\": 0,\n \"topicality\": 0,\n \"nsfw\": false,\n \"nsfw_confidence\": 0\n }]\n}"
)
// LabelsJsonSchema returns the canonical label JSON Schema string for OpenAI API endpoints.
//
// Related documentation and references:
// - https://platform.openai.com/docs/guides/structured-outputs
// - https://json-schema.org/learn/miscellaneous-examples
func LabelsJsonSchema(nsfw bool) json.RawMessage {
if nsfw {
return LabelsNSFW
return json.RawMessage(LabelsJsonSchemaNSFW)
} else {
return LabelsDefault
return json.RawMessage(LabelsJsonSchemaDefault)
}
}
// LabelsJson returns the canonical label JSON string for Ollama vision models.
//
// Related documentation and references:
// - https://www.alibabacloud.com/help/en/model-studio/json-mode
// - https://www.json.org/json-en.html
func LabelsJson(nsfw bool) string {
if nsfw {
return LabelsJsonNSFW
} else {
return LabelsJsonDefault
}
}

View file

@ -0,0 +1,36 @@
package schema
import (
"bytes"
"encoding/json"
"fmt"
"github.com/photoprism/photoprism/pkg/clean"
)
const (
NamePrefix = "photoprism_vision"
)
// JsonSchemaName returns the schema name string (including its version suffix) to be used for API requests.
func JsonSchemaName(schema json.RawMessage, version string) string {
var schemaName string
switch {
case bytes.Contains(schema, []byte("labels")):
schemaName = "labels"
case bytes.Contains(schema, []byte("caption")):
schemaName = "caption"
default:
schemaName = "schema"
}
version = clean.TypeLowerUnderscore(version)
if version == "" {
version = "v1"
}
return fmt.Sprintf("%s_%s_%s", NamePrefix, schemaName, version)
}

View file

@ -0,0 +1,23 @@
package schema
import (
"encoding/json"
"testing"
"github.com/stretchr/testify/assert"
)
func TestJsonSchemaName(t *testing.T) {
t.Run("Default", func(t *testing.T) {
assert.Equal(t, "photoprism_vision_schema_v1", JsonSchemaName(nil, ""))
})
t.Run("Labels", func(t *testing.T) {
assert.Equal(t, "photoprism_vision_labels_v1", JsonSchemaName(json.RawMessage(LabelsJsonSchemaDefault), ""))
})
t.Run("LabelsV1", func(t *testing.T) {
assert.Equal(t, "photoprism_vision_labels_v2", JsonSchemaName([]byte("labels"), "v2"))
})
t.Run("LabelsJsonSchema", func(t *testing.T) {
assert.Equal(t, "photoprism_vision_labels_v1", JsonSchemaName(LabelsJsonSchema(false), "v1"))
})
}

View file

@ -1,5 +1,5 @@
/*
Package schema defines canonical JSON schema templates shared by PhotoPrism's AI vision engines.
Package schema defines canonical JSON and JSON Schema templates shared by PhotoPrism's AI vision engines.
Copyright (c) 2018 - 2025 PhotoPrism UG. All rights reserved.

View file

@ -1,6 +1,9 @@
package vision
import (
"os"
"strings"
"github.com/photoprism/photoprism/pkg/http/scheme"
)
@ -36,7 +39,9 @@ func (m *Service) EndpointKey() string {
return ""
}
return m.Key
ensureEnv()
return strings.TrimSpace(os.ExpandEnv(m.Key))
}
// EndpointFileScheme returns the endpoint API file scheme type.

View file

@ -9,14 +9,12 @@ func TestThresholds_GetConfidence(t *testing.T) {
t.Fatalf("expected 0, got %d", got)
}
})
t.Run("AboveMax", func(t *testing.T) {
th := Thresholds{Confidence: 150}
if got := th.GetConfidence(); got != 1 {
t.Fatalf("expected 1, got %d", got)
}
})
t.Run("Float", func(t *testing.T) {
th := Thresholds{Confidence: 25}
if got := th.GetConfidenceFloat32(); got != 0.25 {
@ -32,14 +30,12 @@ func TestThresholds_GetTopicality(t *testing.T) {
t.Fatalf("expected 0, got %d", got)
}
})
t.Run("AboveMax", func(t *testing.T) {
th := Thresholds{Topicality: 300}
if got := th.GetTopicality(); got != 1 {
t.Fatalf("expected 1, got %d", got)
}
})
t.Run("Float", func(t *testing.T) {
th := Thresholds{Topicality: 45}
if got := th.GetTopicalityFloat32(); got != 0.45 {
@ -55,14 +51,12 @@ func TestThresholds_GetNSFW(t *testing.T) {
t.Fatalf("expected default %d, got %d", DefaultThresholds.NSFW, got)
}
})
t.Run("AboveMax", func(t *testing.T) {
th := Thresholds{NSFW: 200}
if got := th.GetNSFW(); got != 1 {
t.Fatalf("expected 1, got %d", got)
}
})
t.Run("Float", func(t *testing.T) {
th := Thresholds{NSFW: 80}
if got := th.GetNSFWFloat32(); got != 0.8 {

View file

@ -25,7 +25,34 @@ Additional information can be found in our Developer Guide:
package vision
import (
"os"
"strings"
"sync"
"github.com/photoprism/photoprism/internal/event"
"github.com/photoprism/photoprism/pkg/clean"
"github.com/photoprism/photoprism/pkg/fs"
)
var log = event.Log
var ensureEnvOnce sync.Once
// ensureEnv loads environment-backed credentials once so adapters can look up
// OPENAI_API_KEY even when operators rely on OPENAI_API_KEY_FILE. Future engine
// integrations can reuse this hook to normalise additional secrets.
func ensureEnv() {
ensureEnvOnce.Do(func() {
if os.Getenv("OPENAI_API_KEY") != "" {
return
}
if path := strings.TrimSpace(os.Getenv("OPENAI_API_KEY_FILE")); fs.FileExistsNotEmpty(path) {
if data, err := os.ReadFile(path); err == nil {
if key := clean.Auth(string(data)); key != "" {
_ = os.Setenv("OPENAI_API_KEY", key)
}
}
}
})
}

View file

@ -339,7 +339,14 @@ func OIDCRedirect(router *gin.RouterGroup) {
sess.SetAuthID(user.AuthID, provider.Issuer())
sess.SetUser(user)
sess.SetGrantType(authn.GrantAuthorizationCode)
sess.IdToken = tokens.IDToken
// Ensure that the ID token fits into the existing
// database column; otherwise, truncate it.
if n := len(tokens.IDToken); n > 2048 {
sess.IdToken = tokens.IDToken[:2048]
} else {
sess.IdToken = tokens.IDToken
}
// Set session expiration and timeout.
sess.SetExpiresIn(unix.Day)

View file

@ -4542,6 +4542,12 @@
"prompt": {
"type": "string"
},
"schema": {
"items": {
"type": "integer"
},
"type": "array"
},
"stream": {
"type": "boolean"
},
@ -4562,6 +4568,15 @@
},
"vision.ApiRequestOptions": {
"properties": {
"combine_outputs": {
"type": "string"
},
"detail": {
"type": "string"
},
"force_json": {
"type": "boolean"
},
"frequency_penalty": {
"type": "number"
},
@ -4571,6 +4586,9 @@
"main_gpu": {
"type": "integer"
},
"max_output_tokens": {
"type": "integer"
},
"min_p": {
"type": "number"
},
@ -4616,6 +4634,9 @@
"repeat_penalty": {
"type": "number"
},
"schema_version": {
"type": "string"
},
"seed": {
"type": "integer"
},

View file

@ -0,0 +1,27 @@
# Config Package Guide
## Overview
PhotoPrism's runtime configuration is managed by this package. Fields are defined in [`options.go`](options.go) and then initialized with values from command-line flags, environment variables, and optional YAML files (`storage/config/*.yml`).
## Sources and Precedence
PhotoPrism loads configuration in the following order:
1. **Built-in defaults** defined in this package.
2. **`defaults.yml`** — optional system defaults (typically `/etc/photoprism/defaults.yml`). See [Global Config Defaults](https://docs.photoprism.app/getting-started/config-files/defaults/) if you package PhotoPrism for other environments and need to override the compiled defaults.
3. **Environment variables** prefixed with `PHOTOPRISM_…` and specified in [`flags.go`](flags.go) along with the CLI flags. This is the primary override mechanism in container environments.
4. **`options.yml`** — user-level configuration stored under `storage/config/options.yml` (or another directory controlled by `PHOTOPRISM_CONFIG_PATH`). Values here override both defaults and environment variables; see [Config Files](https://docs.photoprism.app/getting-started/config-files/).
5. **CLI flags** (for example `photoprism --cache-path=/tmp/cache`). Flags always win when a conflict exists.
The `PHOTOPRISM_CONFIG_PATH` variable controls where PhotoPrism looks for YAML files (defaults to `storage/config`).
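A minimal sketch of this precedence, illustrative only (PhotoPrism's actual loader is more involved): each later non-empty source overrides the previous one, with CLI flags applied last.

```go
package config // illustrative only

// resolve applies the documented precedence: built-in default, defaults.yml,
// environment variable, options.yml, then CLI flag, where later non-empty
// sources win.
func resolve(builtin, defaultsYml, env, optionsYml, flag string) string {
	value := builtin
	for _, v := range []string{defaultsYml, env, optionsYml, flag} {
		if v != "" {
			value = v
		}
	}
	return value
}
```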
> Any change to configuration (flags, env vars, YAML files) requires a restart. The Go process reads options during startup and does not watch for changes.
## CLI Reference
- `photoprism help` (or `photoprism --help`) lists all subcommands and global flags.
- `photoprism show config` renders every active option along with its current value. Pass `--json`, `--md`, `--tsv`, or `--csv` to change the output format.
- `photoprism show config-options` prints the description and default value for each option. Use this when updating [`flags.go`](flags.go).
- `photoprism show config-yaml` displays the configuration keys and their expected types in the [same structure that the YAML files use](https://docs.photoprism.app/getting-started/config-files/). It is a read-only helper meant to guide you when editing files under `storage/config`.
- Additional `show` subcommands document search filters, metadata tags, and supported thumbnail sizes; see [`internal/commands/show.go`](../commands/show.go) for the complete list.

View file

@ -4,5 +4,5 @@ package feat
var (
VisionModelGenerate = false // controls exposure of the generate endpoint and CLI commands
VisionModelMarkers = false // gates marker generation/return until downstream UI and reconciliation paths are ready
VisionServiceOpenAI = false // controls whether users are able to configure OpenAI as a vision service engine
VisionServiceOpenAI = true // controls whether users are able to configure OpenAI as a vision service engine
)

View file

@ -135,6 +135,7 @@ func (w *Vision) Start(filter string, count int, models []string, customSrc stri
done := make(map[string]bool)
offset := 0
updated := 0
processed := 0
// Make sure count is within range.
if count < 1 || count > search.MaxResults {
@ -197,6 +198,8 @@ func (w *Vision) Start(filter string, count int, models []string, customSrc stri
continue
}
processed++
fileName := photoprism.FileName(photo.FileRoot, photo.FileName)
file, fileErr := photoprism.NewMediaFile(fileName)
@ -279,7 +282,18 @@ func (w *Vision) Start(filter string, count int, models []string, customSrc stri
}
}
log.Infof("vision: updated %s [%s]", english.Plural(updated, "picture", "pictures"), time.Since(start))
elapsed := time.Since(start)
switch {
case processed == 0:
log.Infof("vision: no pictures required processing [%s]", elapsed)
case updated == processed:
log.Infof("vision: updated %s [%s]", english.Plural(updated, "picture", "pictures"), elapsed)
case updated == 0:
log.Infof("vision: processed %s (no metadata changes detected) [%s]", english.Plural(processed, "picture", "pictures"), elapsed)
default:
log.Infof("vision: updated %s out of %s [%s]", english.Plural(updated, "picture", "pictures"), english.Plural(processed, "picture", "pictures"), elapsed)
}
if updated > 0 {
updateIndex = true

View file

@ -26,13 +26,13 @@ func TestASCII(t *testing.T) {
}
func BenchmarkASCII(b *testing.B) {
for n := 0; n < b.N; n++ {
for b.Loop() {
ASCII("https://docs.photoprism.app/getting-started 👍/config-options/#file-converters")
}
}
func BenchmarkASCIIEmpty(b *testing.B) {
for n := 0; n < b.N; n++ {
for b.Loop() {
ASCII("")
}
}
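The benchmark updates in this and the following files migrate from the classic counter loop to `testing.B.Loop()`, introduced in Go 1.24. A minimal self-contained sketch of the new pattern (the benchmark name and input are made up for illustration):

```go
package example_test

import "testing"

// BenchmarkLoopPattern demonstrates the b.Loop() style adopted above:
// the framework decides how many iterations to run, and setup code
// before the loop is excluded from the measured time.
func BenchmarkLoopPattern(b *testing.B) {
	input := "https://docs.photoprism.app/" // setup, not timed

	for b.Loop() { // replaces: for n := 0; n < b.N; n++
		_ = len(input) // operation under measurement
	}
}
```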

View file

@ -13,7 +13,7 @@ var DomainRegexp = regexp.MustCompile("^(?:[a-z0-9](?:[a-z0-9-]{0,61}[a-z0-9])?\
// Auth returns the sanitized authentication identifier trimmed to a maximum length of 255 characters.
func Auth(s string) string {
if s == "" || len(s) > 2048 {
if s == "" || len(s) > 510 {
return ""
}

View file

@ -43,6 +43,12 @@ func TestAuth(t *testing.T) {
t.Run("TeLessThanSGreaterThanT", func(t *testing.T) {
assert.Equal(t, "Test", Auth("Te<s>t"))
})
t.Run("ApiKey", func(t *testing.T) {
assert.Equal(t,
"ab-prot-keech1aqu8quamiNaecuisuem1ahg7dieph8eitohzo7hoo7pe-Chohzu4eaA-Chohzu4ea-soh7Seic8eig9joojaeshe4Ahsu8zeibooCh9ooquaaleev3poLeev0su9jei2yeich3ahsi9quar1oqueic",
Auth("ab-prot-keech1aqu8quamiNaecuisuem1ahg7dieph8eitohzo7hoo7pe-Chohzu4eaA-Chohzu4ea-soh7Seic8eig9joojaeshe4Ahsu8zeibooCh9ooquaaleev3poLeev0su9jei2yeich3ahsi9quar1oqueic"),
)
})
}
func TestHandle(t *testing.T) {

View file

@ -27,13 +27,13 @@ func TestHeader(t *testing.T) {
}
func BenchmarkHeader(b *testing.B) {
for n := 0; n < b.N; n++ {
for b.Loop() {
Header("https://..docs.photoprism.app/gettin\\g-started/config-options/\tfile-converters")
}
}
func BenchmarkHeaderEmpty(b *testing.B) {
for n := 0; n < b.N; n++ {
for b.Loop() {
Header("")
}
}

View file

@ -48,7 +48,7 @@ func TestSearchQuery(t *testing.T) {
func BenchmarkSearchQuery_Complex(b *testing.B) {
s := "Jens AND Mander and me Or Kitty WITH flowers IN the park AT noon | img% json OR BILL!\n"
b.ReportAllocs()
for i := 0; i < b.N; i++ {
for b.Loop() {
_ = SearchQuery(s)
}
}
@ -56,7 +56,7 @@ func BenchmarkSearchQuery_Complex(b *testing.B) {
func BenchmarkSearchQuery_Short(b *testing.B) {
s := "cat and dog"
b.ReportAllocs()
for i := 0; i < b.N; i++ {
for b.Loop() {
_ = SearchQuery(s)
}
}
@ -65,7 +65,7 @@ func BenchmarkSearchQuery_LongNoOps(b *testing.B) {
// No tokens to replace, primarily tests normalization + trim.
s := strings.Repeat("alpha beta gamma ", 50)
b.ReportAllocs()
for i := 0; i < b.N; i++ {
for b.Loop() {
_ = SearchQuery(s)
}
}

View file

@ -26,13 +26,13 @@ func TestUri(t *testing.T) {
}
func BenchmarkUri(b *testing.B) {
for n := 0; n < b.N; n++ {
for b.Loop() {
Uri("https://docs.photoprism.app/getting-started/config-options/#file-converters")
}
}
func BenchmarkUriEmpty(b *testing.B) {
for n := 0; n < b.N; n++ {
for b.Loop() {
Uri("")
}
}

View file

@ -233,7 +233,7 @@ var benchDir = flag.String("benchdir", runtime.GOROOT(), "The directory to scan
func BenchmarkFastWalk(b *testing.B) {
b.ReportAllocs()
for i := 0; i < b.N; i++ {
for b.Loop() {
err := fastwalk.Walk(*benchDir, func(path string, typ os.FileMode) error { return nil })
if err != nil {
b.Fatal(err)

View file

@ -27,13 +27,13 @@ func TestCacheControlMaxAge(t *testing.T) {
}
func BenchmarkTestCacheControlMaxAge(b *testing.B) {
for n := 0; n < b.N; n++ {
for b.Loop() {
_ = CacheControlMaxAge(DurationYear, false)
}
}
func BenchmarkTestCacheControlMaxAgeImmutable(b *testing.B) {
for n := 0; n < b.N; n++ {
for b.Loop() {
_ = CacheControlMaxAge(DurationYear, false) + ", " + CacheControlImmutable
}
}

View file

@ -33,7 +33,7 @@ func BenchmarkContainsAny_LargeOverlap(b *testing.B) {
bList[i] = a[i*4]
}
b.ReportAllocs()
for i := 0; i < b.N; i++ {
for b.Loop() {
if !ContainsAny(a, bList) {
b.Fatalf("expected overlap")
}
@ -44,7 +44,7 @@ func BenchmarkContainsAny_Disjoint(b *testing.B) {
a := makeStrings("a", 5000)
bList := makeStrings("b", 5000)
b.ReportAllocs()
for i := 0; i < b.N; i++ {
for b.Loop() {
if ContainsAny(a, bList) {
b.Fatalf("expected disjoint")
}
@ -56,7 +56,7 @@ func BenchmarkJoin_Large(b *testing.B) {
j := append(makeStrings("y", 5000), a[:1000]...) // 1000 duplicates
j = shuffleEveryK(j, 7)
b.ReportAllocs()
for i := 0; i < b.N; i++ {
for b.Loop() {
out := Join(a, j)
if len(out) != 10000 {
b.Fatalf("unexpected length: %d", len(out))

View file

@ -166,7 +166,7 @@ func TestIsJoinToken(t *testing.T) {
}
func BenchmarkJoinToken(b *testing.B) {
for n := 0; n < b.N; n++ {
for b.Loop() {
JoinToken()
}
}

View file

@ -29,7 +29,7 @@ func TestClip(t *testing.T) {
func BenchmarkClipRunesASCII(b *testing.B) {
s := strings.Repeat("abc def ghi ", 20) // ASCII
b.ReportAllocs()
for i := 0; i < b.N; i++ {
for b.Loop() {
_ = Clip(s, 50)
}
}
@ -37,7 +37,7 @@ func BenchmarkClipRunesASCII(b *testing.B) {
func BenchmarkClipRunesUTF8(b *testing.B) {
s := strings.Repeat("Grüße 世", 20) // non-ASCII runes
b.ReportAllocs()
for i := 0; i < b.N; i++ {
for b.Loop() {
_ = Clip(s, 50)
}
}

View file

@ -115,7 +115,7 @@ func TestContainsAlnumLower(t *testing.T) {
func BenchmarkContainsNumber(b *testing.B) {
s := "The quick brown fox jumps over 13 lazy dogs"
b.ReportAllocs()
for i := 0; i < b.N; i++ {
for b.Loop() {
_ = ContainsNumber(s)
}
}
@ -123,7 +123,7 @@ func BenchmarkContainsNumber(b *testing.B) {
func BenchmarkSortCaseInsensitive(b *testing.B) {
words := []string{"Zebra", "apple", "Banana", "cherry", "Apricot", "banana", "zebra", "Cherry"}
b.ReportAllocs()
for i := 0; i < b.N; i++ {
for b.Loop() {
w := append([]string(nil), words...)
SortCaseInsensitive(w)
}

View file

@ -46,7 +46,7 @@ func makeLargeText(distinct, repeats int) string {
func BenchmarkWords_Large(b *testing.B) {
s := makeLargeText(200, 200) // ~40k tokens mixed
b.ReportAllocs()
for i := 0; i < b.N; i++ {
for b.Loop() {
_ = Words(s)
}
}
@ -54,7 +54,7 @@ func BenchmarkWords_Large(b *testing.B) {
func BenchmarkUniqueKeywords_Large(b *testing.B) {
s := makeLargeText(200, 200)
b.ReportAllocs()
for i := 0; i < b.N; i++ {
for b.Loop() {
_ = UniqueKeywords(s)
}
}
@ -62,7 +62,7 @@ func BenchmarkUniqueKeywords_Large(b *testing.B) {
func BenchmarkUniqueKeywords_ManyDup(b *testing.B) {
s := makeLargeText(20, 2000) // many repeats, few distinct
b.ReportAllocs()
for i := 0; i < b.N; i++ {
for b.Loop() {
_ = UniqueKeywords(s)
}
}

File diff suppressed because one or more lines are too long