Config: Serve "robots.txt" file to control how site is indexed #4574

Signed-off-by: Michael Mayer <michael@photoprism.app>
This commit is contained in:
Michael Mayer 2024-11-07 19:09:43 +01:00
parent 9b8086b91c
commit 7f02d4efdb
23 changed files with 194 additions and 100 deletions

View file

@ -85,8 +85,8 @@ func FlushCoverCache() {
}
// AddCacheHeader adds a cache control header to the response.
func AddCacheHeader(c *gin.Context, maxAge ttl.Duration, public bool) {
header.SetCacheControl(c, maxAge.Int(), public)
func AddCacheHeader(c *gin.Context, duration ttl.Duration, public bool) {
header.SetCacheControl(c, duration.Int(), public)
}
// AddCoverCacheHeader adds cover image cache control headers to the response.

View file

@ -31,7 +31,7 @@ func Shares(router *gin.RouterGroup) {
clientConfig := conf.ClientShare()
clientConfig.SiteUrl = clientConfig.SiteUrl + path.Join("s", token)
uri := conf.BaseUri("/library/albums")
uri := conf.LibraryUri("/albums")
c.HTML(http.StatusOK, "share.gohtml", gin.H{"shared": gin.H{"token": token, "uri": uri}, "config": clientConfig})
})
@ -62,7 +62,7 @@ func Shares(router *gin.RouterGroup) {
}
}
uri := conf.BaseUri(path.Join("/library/albums", uid, shared))
uri := conf.LibraryUri(path.Join("/albums", uid, shared))
c.HTML(http.StatusOK, "share.gohtml", gin.H{"shared": gin.H{"token": token, "uri": uri}, "config": clientConfig})
})

View file

@ -16,7 +16,7 @@ func TestConfig_ClientConfig(t *testing.T) {
result := c.ClientPublic()
assert.IsType(t, ClientConfig{}, result)
assert.Equal(t, AuthModePublic, result.AuthMode)
assert.Equal(t, "/library/browse", result.LoginUri)
assert.Equal(t, c.LibraryUri("/browse"), result.LoginUri)
assert.Equal(t, "", result.RegisterUri)
assert.Equal(t, 0, result.PasswordLength)
assert.Equal(t, "", result.PasswordResetUri)

View file

@ -129,11 +129,11 @@ func (c *Config) RegisterUri() string {
// LoginUri returns the user authentication page URI.
func (c *Config) LoginUri() string {
if c.Public() {
return c.BaseUri("/library/browse")
return c.LibraryUri("/browse")
}
if c.options.LoginUri == "" {
return c.BaseUri("/library/login")
return c.LibraryUri("/login")
}
return c.options.LoginUri

View file

@ -9,6 +9,9 @@ import (
// ApiUri defines the standard path for handling REST requests.
const ApiUri = "/api/v1"
// LibraryUri defines the path for user interface routes.
const LibraryUri = "/library"
// StaticUri defines the standard path for serving static content.
const StaticUri = "/static"

View file

@ -211,7 +211,7 @@ func (c *Config) TemplateExists(name string) bool {
}
}
// TemplateName returns the name of the default template (e.g. index.gohtml).
// TemplateName returns the name of the user interface bootstrap template.
func (c *Config) TemplateName() string {
if s := c.Settings(); s != nil {
if c.TemplateExists(s.Templates.Default) {

View file

@ -1,11 +1,20 @@
package config
import (
_ "embed"
"fmt"
"net/url"
"os"
"path/filepath"
"strings"
"github.com/photoprism/photoprism/pkg/clean"
"github.com/photoprism/photoprism/pkg/fs"
)
//go:embed robots.txt
var robotsTxt []byte
const localhost = "localhost"
// BaseUri returns the site base URI for a given resource.
@ -28,6 +37,11 @@ func (c *Config) ApiUri() string {
return c.BaseUri(ApiUri)
}
// LibraryUri builds the URI of a user interface resource by appending res
// to the LibraryUri path constant and resolving the result with BaseUri.
func (c *Config) LibraryUri(res string) string {
	uiPath := LibraryUri + res

	return c.BaseUri(uiPath)
}
// ContentUri returns the content delivery URI based on the CdnUrl and the ApiUri.
func (c *Config) ContentUri() string {
return c.CdnUrl(c.ApiUri())
@ -148,3 +162,25 @@ func (c *Config) LegalUrl() string {
return c.options.LegalUrl
}
// RobotsTxt returns the content of the robots.txt file to be used for this site:
// https://developers.google.com/search/docs/crawling-indexing/robots/create-robots-txt
//
// Public demo instances get generated rules that allow the user interface and
// static assets to be crawled. Other public instances always get the embedded
// default robots.txt. In all other cases, a robots.txt file found in the config
// path is returned, with the embedded default as fallback if it does not exist.
//
// If the custom file exists but cannot be read, the embedded default is returned
// together with the read error, so that callers which ignore the error still
// get usable content.
func (c *Config) RobotsTxt() ([]byte, error) {
	if c.Demo() && c.Public() {
		// Allow public demo instances to be indexed. Rule paths must start with "/"
		// to be valid, so file extensions are matched with the "*" and "$" wildcards.
		return []byte(fmt.Sprintf("User-agent: *\nDisallow: /\nAllow: %s/\nAllow: %s/\nAllow: /*.js$\nAllow: /*.css$", LibraryUri, StaticUri)), nil
	}

	if c.Public() {
		// Do not allow other instances to be indexed when public mode is enabled.
		return robotsTxt, nil
	}

	fileName := filepath.Join(c.ConfigPath(), "robots.txt")

	if !fs.FileExists(fileName) {
		// Do not allow indexing if config/robots.txt does not exist.
		return robotsTxt, nil
	}

	robots, robotsErr := os.ReadFile(fileName)

	if robotsErr != nil {
		// Log error and do not allow indexing if config/robots.txt cannot be read.
		// The error is logged here as well because callers may discard it and
		// only use the fallback content.
		log.Debugf("config: failed to read robots.txt file (%s)", clean.Error(robotsErr))
		return robotsTxt, robotsErr
	}

	// Return content of the config/robots.txt file.
	return robots, nil
}

View file

@ -39,6 +39,22 @@ func TestConfig_ApiUri(t *testing.T) {
assert.Equal(t, "/foo"+ApiUri, c.ApiUri())
}
// TestConfig_LibraryUri checks that LibraryUri prefixes resources with the
// library path and honors the base path of the configured site URL.
func TestConfig_LibraryUri(t *testing.T) {
	c := NewConfig(CliTestContext())

	// expect asserts the URIs of the three sample resources for the given base path.
	expect := func(base string) {
		assert.Equal(t, base+"/library", c.LibraryUri(""))
		assert.Equal(t, base+"/library/", c.LibraryUri("/"))
		assert.Equal(t, base+"/library/browse", c.LibraryUri("/browse"))
	}

	// Default test configuration.
	expect("")

	// Site URL without a path component.
	c.options.SiteUrl = "http://superhost:2342/"
	expect("")

	// Site URL with a "/foo/" path component.
	c.options.SiteUrl = "http://foo:2342/foo/"
	expect("/foo")
}
func TestConfig_ContentUri(t *testing.T) {
c := NewConfig(CliTestContext())
@ -151,3 +167,27 @@ func TestConfig_SiteDescription(t *testing.T) {
c.options.SiteDescription = ""
assert.Equal(t, "", c.SiteDescription())
}
// TestConfig_LegalInfo checks that LegalInfo and LegalUrl reflect the
// corresponding option values before, while, and after they are set.
func TestConfig_LegalInfo(t *testing.T) {
	c := NewConfig(CliTestContext())

	// verify asserts the values currently reported by both getters.
	verify := func(info, link string) {
		assert.Equal(t, info, c.LegalInfo())
		assert.Equal(t, link, c.LegalUrl())
	}

	verify("", "")

	c.options.LegalInfo = "ACME Inc."
	c.options.LegalUrl = "https://example.com/"
	verify(c.options.LegalInfo, c.options.LegalUrl)

	c.options.LegalInfo = ""
	c.options.LegalUrl = ""
	verify("", "")
}
// TestConfig_RobotsTxt checks that the test configuration yields the
// embedded robots.txt content without an error.
func TestConfig_RobotsTxt(t *testing.T) {
	conf := NewConfig(CliTestContext())

	content, readErr := conf.RobotsTxt()

	assert.NoError(t, readErr)
	assert.Equal(t, robotsTxt, content)
}

View file

@ -0,0 +1,3 @@
User-agent: *
User-agent: AdsBot-Google
Disallow: /

View file

@ -1,18 +0,0 @@
package server
import (
"github.com/gin-gonic/gin"
"github.com/photoprism/photoprism/internal/config"
"github.com/photoprism/photoprism/pkg/header"
)
// Robots is a middleware that adds a "X-Robots-Tag" header to the response:
// https://developers.google.com/search/docs/crawling-indexing/robots-meta-tag#xrobotstag
var Robots = func(conf *config.Config) gin.HandlerFunc {
return func(c *gin.Context) {
// Block search engines until a public picture wall has been implemented,
// see https://github.com/photoprism/photoprism/issues/669.
c.Header(header.Robots, header.RobotsNone)
}
}

View file

@ -19,10 +19,10 @@ func registerRoutes(router *gin.Engine, conf *config.Config) {
// Register static asset and templates routes.
registerStaticRoutes(router, conf)
// Register PWA bootstrap and config routes.
registerPWARoutes(router, conf)
// Register user interface routes.
registerWebAppRoutes(router, conf)
// Register built-in WebDAV server routes.
// Register WebDAV server routes.
registerWebDAVRoutes(router, conf)
// Register sharing routes starting with "/s".

View file

@ -8,11 +8,12 @@ import (
"github.com/photoprism/photoprism/internal/api"
"github.com/photoprism/photoprism/internal/config"
"github.com/photoprism/photoprism/pkg/header"
)
// registerStaticRoutes adds routes for serving static content and templates.
func registerStaticRoutes(router *gin.Engine, conf *config.Config) {
// Redirects to the PWA for now, can be replaced by a template later.
// Redirects to the login page.
login := func(c *gin.Context) {
if conf.OIDCEnabled() && conf.OIDCRedirect() {
c.Redirect(http.StatusTemporaryRedirect, conf.OIDCLoginUri())
@ -21,6 +22,20 @@ func registerStaticRoutes(router *gin.Engine, conf *config.Config) {
}
}
// Control how crawlers index the site by serving a "robots.txt" file in addition
// to the "X-Robots-Tag" response header set in the Security middleware:
// https://developers.google.com/search/docs/crawling-indexing/robots/create-robots-txt
router.Any(conf.BaseUri("/robots.txt"), func(c *gin.Context) {
if robotsTxt, _ := conf.RobotsTxt(); len(robotsTxt) == 0 {
// Return error 404 if file cannot be read or is empty.
c.Data(http.StatusNotFound, header.ContentTypeText, []byte{})
} else {
// Allow clients to cache the response for one day.
c.Header(header.CacheControl, header.CacheControlMaxAge(header.DurationDay, true))
c.Data(http.StatusOK, header.ContentTypeText, robotsTxt)
}
})
router.Any(conf.BaseUri("/"), login)
// Shows "Page Not found" error if no other handler is registered.

View file

@ -40,7 +40,7 @@ func TestStaticRoutes(t *testing.T) {
})
}
func TestPWARoutes(t *testing.T) {
func TestWebAppRoutes(t *testing.T) {
// Create router.
r := gin.Default()
@ -50,39 +50,37 @@ func TestPWARoutes(t *testing.T) {
// Find and load templates.
r.LoadHTMLFiles(conf.TemplateFiles()...)
// Register routes.
registerPWARoutes(r, conf)
// Register user interface routes.
registerWebAppRoutes(r, conf)
// Bootstrapping.
t.Run("GetLibrary", func(t *testing.T) {
w := httptest.NewRecorder()
req, _ := http.NewRequest("GET", "/library/", nil)
req, _ := http.NewRequest("GET", conf.LibraryUri("/"), nil)
r.ServeHTTP(w, req)
assert.Equal(t, 200, w.Code)
assert.NotEmpty(t, w.Body)
})
t.Run("HeadLibrary", func(t *testing.T) {
w := httptest.NewRecorder()
req, _ := http.NewRequest("HEAD", "/library/", nil)
req, _ := http.NewRequest("HEAD", conf.LibraryUri("/"), nil)
r.ServeHTTP(w, req)
assert.Equal(t, 200, w.Code)
assert.NotEmpty(t, w.Body)
})
t.Run("GetLibraryBrowse", func(t *testing.T) {
w := httptest.NewRecorder()
req, _ := http.NewRequest("GET", "/library/browse", nil)
req, _ := http.NewRequest("GET", conf.LibraryUri("/browse"), nil)
r.ServeHTTP(w, req)
assert.Equal(t, 200, w.Code)
assert.NotEmpty(t, w.Body)
})
t.Run("HeadLibraryBrowse", func(t *testing.T) {
w := httptest.NewRecorder()
req, _ := http.NewRequest("HEAD", "/library/browse", nil)
req, _ := http.NewRequest("HEAD", conf.LibraryUri("/browse"), nil)
r.ServeHTTP(w, req)
assert.Equal(t, 200, w.Code)
})
// Manifest.
t.Run("GetManifest", func(t *testing.T) {
w := httptest.NewRecorder()
req, _ := http.NewRequest("GET", "/manifest.json", nil)
@ -95,8 +93,6 @@ func TestPWARoutes(t *testing.T) {
assert.True(t, strings.Contains(manifest, `"start_url": "/library/",`))
assert.True(t, strings.Contains(manifest, "/static/icons/logo/128.png"))
})
// Service worker.
t.Run("GetServiceWorker", func(t *testing.T) {
w := httptest.NewRecorder()
req, _ := http.NewRequest("GET", "/sw.js", nil)

View file

@ -11,25 +11,28 @@ import (
"github.com/photoprism/photoprism/pkg/header"
)
// registerPWARoutes adds routes for bootstrapping and configuring the progressive web app user interface.
func registerPWARoutes(router *gin.Engine, conf *config.Config) {
// Loads Progressive Web App (PWA) on all routes beginning with "library".
pwa := func(c *gin.Context) {
// registerWebAppRoutes adds routes for the web user interface.
func registerWebAppRoutes(router *gin.Engine, conf *config.Config) {
// Serve user interface bootstrap template on all routes starting with "/library".
ui := func(c *gin.Context) {
// Prevent CDNs from caching this endpoint.
if header.IsCdn(c.Request) {
api.AbortNotFound(c)
return
}
// Set values for UI bootstrap template.
values := gin.H{
"signUp": config.SignUp,
"config": conf.ClientPublic(),
}
// Render UI bootstrap template.
c.HTML(http.StatusOK, conf.TemplateName(), values)
}
router.Any(conf.BaseUri("/library/*path"), pwa)
router.Any(conf.LibraryUri("/*path"), ui)
// Progressive Web App (PWA) Manifest.
// Serve the user interface manifest file.
manifest := func(c *gin.Context) {
c.Header(header.CacheControl, header.CacheControlNoStore)
c.Header(header.ContentType, header.ContentTypeJsonUtf8)
@ -37,7 +40,7 @@ func registerPWARoutes(router *gin.Engine, conf *config.Config) {
}
router.Any(conf.BaseUri("/manifest.json"), manifest)
// Progressive Web App (PWA) Service Worker.
// Serve user interface service worker file.
swWorker := func(c *gin.Context) {
c.Header(header.CacheControl, header.CacheControlNoStore)
c.File(filepath.Join(conf.BuildPath(), "sw.js"))

View file

@ -11,18 +11,26 @@ import (
// Security is a middleware that adds security-related headers to the server's response.
var Security = func(conf *config.Config) gin.HandlerFunc {
return func(c *gin.Context) {
// Abort if the request should not be served through a CDN.
// Only allow crawlers to index the site if it is a public demo (or if there is a public image wall):
// https://github.com/photoprism/photoprism/issues/669
if !conf.Demo() || !conf.Public() {
// Set "X-Robots-Tag" header:
// https://developers.google.com/search/docs/crawling-indexing/robots-meta-tag#xrobotstag
c.Header(header.RobotsTag, header.RobotsNone)
}
// Abort if the request must not be served through a CDN.
if header.AbortCdnRequest(c.Request) {
api.AbortNotFound(c)
return
}
// Set Content Security Policy.
// See: https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Content-Security-Policy
// Set "Content-Security-Policy" header:
// https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Content-Security-Policy
c.Header(header.ContentSecurityPolicy, header.DefaultContentSecurityPolicy)
// Set Frame Options.
// See: https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/X-Frame-Options
// Set "X-Frame-Options" header:
// https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/X-Frame-Options
c.Header(header.FrameOptions, header.DefaultFrameOptions)
}
}

View file

@ -78,9 +78,6 @@ func Start(ctx context.Context, conf *config.Config) {
// Register security middleware.
router.Use(Security(conf))
// Register robots tag middleware.
router.Use(Robots(conf))
// Create REST API router group.
APIv1 = router.Group(conf.BaseUri(config.ApiUri), Api(conf))

View file

@ -44,19 +44,19 @@ var (
)
// CacheControlMaxAge returns a CacheControl header value based on the specified
// maxAge time in seconds or the defaults if maxAge is not a positive number.
func CacheControlMaxAge(maxAge int, public bool) string {
if maxAge < 0 {
// duration in seconds or the defaults if duration is not a positive number.
func CacheControlMaxAge(duration int, public bool) string {
if duration < 0 {
return CacheControlNoCache
} else if maxAge > CacheYearInt {
maxAge = CacheYearInt
} else if duration > DurationYear {
duration = DurationYear
}
switch {
case maxAge > 0 && public:
return "public, max-age=" + strconv.Itoa(maxAge)
case maxAge > 0:
return "private, max-age=" + strconv.Itoa(maxAge)
case duration > 0 && public:
return "public, max-age=" + strconv.Itoa(duration)
case duration > 0:
return "private, max-age=" + strconv.Itoa(duration)
case public:
return CacheControlPublicDefault
default:
@ -66,14 +66,14 @@ func CacheControlMaxAge(maxAge int, public bool) string {
// SetCacheControl adds a CacheControl header to the response based on the specified parameters.
// If duration is 0, the defaults will be used.
func SetCacheControl(c *gin.Context, maxAge int, public bool) {
func SetCacheControl(c *gin.Context, duration int, public bool) {
if c == nil {
return
} else if c.Writer == nil {
return
}
c.Header(CacheControl, CacheControlMaxAge(maxAge, public))
c.Header(CacheControl, CacheControlMaxAge(duration, public))
}
// SetCacheControlImmutable adds a CacheControl header to the response based on the specified parameters

View file

@ -1,22 +0,0 @@
package header
// CacheMinute is one minute in seconds.
const CacheMinute int64 = 60
// CacheHour is one hour in seconds.
const CacheHour = CacheMinute * 60
// CacheDay is one day in seconds.
const CacheDay = CacheHour * 24
// CacheWeek is one week in seconds.
const CacheWeek = CacheDay * 7
// CacheMonth is about one month in seconds.
const CacheMonth = CacheDay * 31
// CacheYear is 365 days in seconds.
const CacheYear = CacheDay * 365
// CacheYearInt is CacheYear specified as integer.
const CacheYearInt = int(CacheYear)

View file

@ -14,27 +14,27 @@ func TestCacheControlMaxAge(t *testing.T) {
assert.Equal(t, CacheControlPrivateDefault, CacheControlMaxAge(0, false))
assert.Equal(t, "no-cache", CacheControlMaxAge(-1, false))
assert.Equal(t, "private, max-age=1", CacheControlMaxAge(1, false))
assert.Equal(t, "private, max-age=31536000", CacheControlMaxAge(CacheYearInt, false))
assert.Equal(t, "private, max-age=31536000", CacheControlMaxAge(DurationYear, false))
assert.Equal(t, "private, max-age=31536000", CacheControlMaxAge(1231536000, false))
})
t.Run("Public", func(t *testing.T) {
assert.Equal(t, CacheControlPublicDefault, CacheControlMaxAge(0, true))
assert.Equal(t, "no-cache", CacheControlMaxAge(-1, true))
assert.Equal(t, "public, max-age=1", CacheControlMaxAge(1, true))
assert.Equal(t, "public, max-age=31536000", CacheControlMaxAge(CacheYearInt, true))
assert.Equal(t, "public, max-age=31536000", CacheControlMaxAge(DurationYear, true))
assert.Equal(t, "public, max-age=31536000", CacheControlMaxAge(1231536000, true))
})
}
func BenchmarkTestCacheControlMaxAge(b *testing.B) {
for n := 0; n < b.N; n++ {
_ = CacheControlMaxAge(CacheYearInt, false)
_ = CacheControlMaxAge(DurationYear, false)
}
}
func BenchmarkTestCacheControlMaxAgeImmutable(b *testing.B) {
for n := 0; n < b.N; n++ {
_ = CacheControlMaxAge(CacheYearInt, false) + ", " + CacheControlImmutable
_ = CacheControlMaxAge(DurationYear, false) + ", " + CacheControlImmutable
}
}
@ -47,7 +47,7 @@ func TestSetCacheControl(t *testing.T) {
Header: make(http.Header),
}
SetCacheControl(c, CacheYearInt, false)
SetCacheControl(c, DurationYear, false)
assert.Equal(t, "private, max-age=31536000", c.Writer.Header().Get(CacheControl))
})
t.Run("Public", func(t *testing.T) {
@ -58,7 +58,7 @@ func TestSetCacheControl(t *testing.T) {
Header: make(http.Header),
}
SetCacheControl(c, CacheYearInt, true)
SetCacheControl(c, DurationYear, true)
assert.Equal(t, "public, max-age=31536000", c.Writer.Header().Get(CacheControl))
})
t.Run("NoCache", func(t *testing.T) {
@ -83,7 +83,7 @@ func TestSetCacheControlImmutable(t *testing.T) {
Header: make(http.Header),
}
SetCacheControlImmutable(c, CacheYearInt, false)
SetCacheControlImmutable(c, DurationYear, false)
assert.Equal(t, "private, max-age=31536000, immutable", c.Writer.Header().Get(CacheControl))
})
t.Run("Public", func(t *testing.T) {
@ -94,7 +94,7 @@ func TestSetCacheControlImmutable(t *testing.T) {
Header: make(http.Header),
}
SetCacheControlImmutable(c, CacheYearInt, true)
SetCacheControlImmutable(c, DurationYear, true)
assert.Equal(t, "public, max-age=31536000, immutable", c.Writer.Header().Get(CacheControl))
})
t.Run("PublicDefault", func(t *testing.T) {

11
pkg/header/duration.go Normal file
View file

@ -0,0 +1,11 @@
package header
// Durations in seconds, e.g. to set a maximum cache age.
const (
	DurationMinute int = 60                  // One minute in seconds
	DurationHour   int = 60 * DurationMinute // One hour in seconds
	DurationDay    int = 24 * DurationHour   // One day in seconds
	DurationWeek   int = 7 * DurationDay     // One week in seconds
	DurationMonth  int = 31 * DurationDay    // About one month in seconds
	DurationYear   int = 365 * DurationDay   // 365 days in seconds
)

View file

@ -0,0 +1,22 @@
package header
import (
"testing"
"time"
"github.com/stretchr/testify/assert"
)
// TestDuration verifies that every duration constant equals the number of
// seconds in the corresponding time.Duration value, including the minute and
// hour constants from which the larger units are derived.
func TestDuration(t *testing.T) {
	var (
		minute = time.Minute
		hour   = time.Hour
		day    = hour * 24
		week   = day * 7
		month  = day * 31
		year   = day * 365
	)

	assert.Equal(t, int(minute.Seconds()), DurationMinute)
	assert.Equal(t, int(hour.Seconds()), DurationHour)
	assert.Equal(t, int(day.Seconds()), DurationDay)
	assert.Equal(t, int(week.Seconds()), DurationWeek)
	assert.Equal(t, int(month.Seconds()), DurationMonth)
	assert.Equal(t, int(year.Seconds()), DurationYear)
}

View file

@ -2,10 +2,10 @@ package header
type RobotsRule = string
// Robots controls how pages are indexed and crawled by search engines:
// RobotsTag controls how pages are indexed and crawled by search engines:
// https://developers.google.com/search/docs/crawling-indexing/robots-meta-tag#xrobotstag
const (
Robots = "X-Robots-Tag"
RobotsTag = "X-Robots-Tag"
)
// Standard Robots header values.

View file

@ -8,7 +8,7 @@ import (
func TestRobots(t *testing.T) {
t.Run("Header", func(t *testing.T) {
assert.Equal(t, "X-Robots-Tag", Robots)
assert.Equal(t, "X-Robots-Tag", RobotsTag)
})
t.Run("Values", func(t *testing.T) {
assert.Equal(t, "all", RobotsAll)