Harden preview enrichment and recommendation metadata
build-push / docker (push) Has been cancelled

This commit is contained in:
AI Assistant
2026-03-16 16:39:09 +09:00
parent 93b9f571ab
commit 2064825d29
7 changed files with 433 additions and 106 deletions
+244 -55
View File
@@ -294,31 +294,16 @@ func (s *SearchService) enrichEnvato(result SearchResult) SearchResult {
extractMetaContent(html, "twitter:image"),
extractJSONLDValue(html, "thumbnailUrl"),
)
if shouldPreferPageThumbnail(result.ThumbnailURL, result.Link) {
if hasUsableThumbnail(pageThumbnail) && shouldPreferPageThumbnail(result.ThumbnailURL, result.Link) {
result.ThumbnailURL = pageThumbnail
}
if result.PreviewVideoURL == "" {
result.PreviewVideoURL = firstNonEmpty(
videoMeta.ContentURL,
extractJSONLDValue(html, "contentUrl"),
extractMetaContent(html, "twitter:player:stream"),
extractVideoPreviewURL(html),
extractEnvatoPreviewFromHydration(html),
deriveEnvatoPreviewFromThumbnail(pageThumbnail),
deriveEnvatoPreviewFromThumbnail(result.ThumbnailURL),
)
result.PreviewVideoURL = collectEnvatoPreviewURL(html, pageThumbnail, result.ThumbnailURL, videoMeta.ContentURL)
}
if result.PreviewVideoURL == "" {
time.Sleep(1200 * time.Millisecond)
if retryHTML, retryErr := s.fetchText(result.Link); retryErr == nil {
result.PreviewVideoURL = firstNonEmpty(
extractJSONLDValue(retryHTML, "contentUrl"),
extractMetaContent(retryHTML, "twitter:player:stream"),
extractVideoPreviewURL(retryHTML),
extractEnvatoPreviewFromHydration(retryHTML),
deriveEnvatoPreviewFromThumbnail(pageThumbnail),
deriveEnvatoPreviewFromThumbnail(result.ThumbnailURL),
)
result.PreviewVideoURL = collectEnvatoPreviewURL(retryHTML, pageThumbnail, result.ThumbnailURL, "")
}
}
s.debug("search_service:enrich_envato_done", map[string]any{
@@ -341,8 +326,8 @@ func (s *SearchService) enrichArtgrid(result SearchResult) SearchResult {
body, err := s.fetchJSONText(apiURL)
if err == nil {
urls := collectURLs(body)
if result.ThumbnailURL == "" {
result.ThumbnailURL = pickImageURL(urls)
if !hasUsableThumbnail(result.ThumbnailURL) {
result.ThumbnailURL = pickArtgridImageURL(urls, clipID)
}
if result.PreviewVideoURL == "" {
result.PreviewVideoURL = pickVideoURL(urls)
@@ -356,12 +341,16 @@ func (s *SearchService) enrichArtgrid(result SearchResult) SearchResult {
html, err := s.fetchText(result.Link)
if err == nil {
if !isMatchingArtgridClipPage(html, clipID) {
s.debug("search_service:enrich_artgrid_html_mismatch", map[string]any{"link": result.Link, "clipId": clipID})
s.debug("search_service:enrich_artgrid_html_mismatch", map[string]any{
"link": result.Link,
"clipId": clipID,
"signals": artgridHTMLSignals(html, clipID),
})
return result
}
result.Title = firstNonEmpty(
cleanArtgridTitle(extractMetaContent(html, "og:title")),
cleanArtgridTitle(extractMetaContent(html, "title")),
cleanArtgridTitle(extractHTMLTitle(html)),
result.Title,
)
result.Snippet = firstNonEmpty(
@@ -374,15 +363,20 @@ func (s *SearchService) enrichArtgrid(result SearchResult) SearchResult {
extractMetaContent(html, "twitter:image"),
extractArtgridBackgroundThumbnail(html, clipID),
extractJSONLDValue(html, "image"),
pickArtgridImageURL(collectURLs(html), clipID),
)
if shouldPreferPageThumbnail(result.ThumbnailURL, result.Link) {
if hasUsableThumbnail(pageThumbnail) && shouldPreferPageThumbnail(result.ThumbnailURL, result.Link) {
result.ThumbnailURL = pageThumbnail
}
if result.PreviewVideoURL == "" {
result.PreviewVideoURL = firstNonEmpty(
extractJSONLDValue(html, "contentUrl"),
extractMetaContent(html, "twitter:player:stream"),
extractMetaContent(html, "og:video"),
extractMetaContent(html, "og:video:url"),
extractMetaContent(html, "og:video:secure_url"),
extractVideoPreviewURL(html),
pickVideoURL(collectURLs(html)),
)
}
if result.PreviewVideoURL == "" {
@@ -677,6 +671,112 @@ func deriveThumbnail(link string) string {
return ""
}
// isLowValueThumbnail reports whether raw looks like a placeholder rather than
// a real preview image. The URL is trimmed and lowercased, then rejected when:
//   - it is blank;
//   - it contains a generic favicon/logo/icon/placeholder marker; or
//   - it contains one of the listed search-engine asset hosts and does not also
//     contain "ytimg.com" (URLs mentioning ytimg.com are exempt from the host
//     check only).
func isLowValueThumbnail(raw string) bool {
	candidate := strings.ToLower(strings.TrimSpace(raw))
	if candidate == "" {
		return true
	}

	genericMarkers := [...]string{
		"favicon", "apple-touch-icon", "/logo", "/icon", "icon.", "logo.", "placehold.co",
	}
	for i := range genericMarkers {
		if strings.Contains(candidate, genericMarkers[i]) {
			return true
		}
	}

	// The ytimg.com exemption applies to every host below, so decide it once
	// instead of re-testing it for each host.
	if strings.Contains(candidate, "ytimg.com") {
		return false
	}

	searchHosts := [...]string{
		"googleusercontent.com", "gstatic.com", "bing.com", "duckduckgo.com", "icons.duckduckgo.com",
	}
	for i := range searchHosts {
		if strings.Contains(candidate, searchHosts[i]) {
			return true
		}
	}
	return false
}
// hasUsableThumbnail reports whether raw is non-blank after trimming and is
// not flagged by isLowValueThumbnail.
func hasUsableThumbnail(raw string) bool {
	if strings.TrimSpace(raw) == "" {
		return false
	}
	return !isLowValueThumbnail(raw)
}
// HasUsableThumbnail is the exported entry point for hasUsableThumbnail: it
// returns that function's verdict on raw unchanged.
func HasUsableThumbnail(raw string) bool {
return hasUsableThumbnail(raw)
}
// IsLowValueThumbnail is the exported entry point for isLowValueThumbnail: it
// returns that function's verdict on raw unchanged.
func IsLowValueThumbnail(raw string) bool {
return isLowValueThumbnail(raw)
}
// buildEmbedURL returns the URL to embed for a result.
//
// A blank link yields "". When source equals "Google Video" (compared
// case-insensitively after trimming) and extractYouTubeID finds an ID in the
// link, the result is a youtube-nocookie.com embed URL for that ID. In every
// other case the trimmed link itself is returned.
func buildEmbedURL(source, link string) string {
	target := strings.TrimSpace(link)
	if target == "" {
		return ""
	}
	if !strings.EqualFold(strings.TrimSpace(source), "Google Video") {
		return target
	}
	id := extractYouTubeID(target)
	if id == "" {
		// No recognizable video ID: fall back to the plain link.
		return target
	}
	return "https://www.youtube-nocookie.com/embed/" + id + "?autoplay=1&rel=0&playsinline=1&modestbranding=1&enablejsapi=1"
}
// defaultMediaMode chooses how a result should be presented and returns
// (mode, embedURL, blockedReason).
//
// mode is one of "embed", "preview_video", "thumbnail" or "none". The source
// is matched exactly (case-sensitively):
//   - "Google Video": embed if an embed URL exists, else thumbnail, else none;
//     the non-embed outcomes carry reason "missing_google_embed" and an empty
//     embed URL.
//   - "Envato": preview video, then thumbnail, then embed; reason
//     "provider_embed_blocked" except for the embed outcome.
//   - "Artgrid": thumbnail, then preview video, then embed; reason
//     "provider_preview_unavailable" except for the embed outcome.
//   - anything else: preview video, then thumbnail, then embed; no reason.
//
// A "none" outcome always returns an empty embed URL.
func defaultMediaMode(source, link, previewURL, thumbnailURL string) (string, string, string) {
	embedURL := buildEmbedURL(source, link)
	hasEmbed := embedURL != ""
	hasPreview := strings.TrimSpace(previewURL) != ""
	hasThumb := hasUsableThumbnail(thumbnailURL)

	if source == "Google Video" {
		switch {
		case hasEmbed:
			return "embed", embedURL, ""
		case hasThumb:
			return "thumbnail", "", "missing_google_embed"
		default:
			return "none", "", "missing_google_embed"
		}
	}

	// The remaining sources share one priority ladder; they differ only in
	// the reason they report and in whether the thumbnail outranks the
	// preview video.
	reason := ""
	thumbnailFirst := false
	switch source {
	case "Envato":
		reason = "provider_embed_blocked"
	case "Artgrid":
		reason = "provider_preview_unavailable"
		thumbnailFirst = true
	}

	switch {
	case thumbnailFirst && hasThumb:
		return "thumbnail", embedURL, reason
	case hasPreview:
		return "preview_video", embedURL, reason
	case hasThumb:
		return "thumbnail", embedURL, reason
	case hasEmbed:
		return "embed", embedURL, ""
	default:
		return "none", "", reason
	}
}
// DecorateRecommendationMedia fills in the media presentation fields of item
// and returns the updated copy.
//
// EmbedURL comes from buildEmbedURL; MediaMode and PreviewBlockedReason come
// from defaultMediaMode. Two safety adjustments follow: an "embed" mode with
// no embed URL is downgraded to "none", and a "thumbnail" mode without a
// usable thumbnail is switched to "preview_video" when a preview URL exists.
func DecorateRecommendationMedia(item AIRecommendation) AIRecommendation {
	mode, _, blockedReason := defaultMediaMode(item.Source, item.Link, item.PreviewVideoURL, item.ThumbnailURL)

	item.EmbedURL = buildEmbedURL(item.Source, item.Link)
	item.MediaMode = mode
	item.PreviewBlockedReason = blockedReason

	switch {
	case item.MediaMode == "embed" && item.EmbedURL == "":
		item.MediaMode = "none"
	case item.MediaMode == "thumbnail" &&
		!hasUsableThumbnail(item.ThumbnailURL) &&
		strings.TrimSpace(item.PreviewVideoURL) != "":
		item.MediaMode = "preview_video"
	}
	return item
}
func extractYouTubeID(link string) string {
patterns := []*regexp.Regexp{
regexp.MustCompile(`(?:v=|\/shorts\/|\/embed\/)([A-Za-z0-9_-]{11})`),
@@ -705,6 +805,15 @@ func extractMetaContent(html, property string) string {
return ""
}
// htmlTitlePattern captures the contents of the first <title> element. It is
// case-insensitive and lets "." match newlines so multi-line titles are
// captured. Compiled once at package scope rather than on every call.
var htmlTitlePattern = regexp.MustCompile(`(?is)<title[^>]*>(.*?)</title>`)

// extractHTMLTitle returns the text of the first <title> element in html,
// trimmed of surrounding whitespace and passed through htmlUnescape. It
// returns "" when the document has no <title> element.
func extractHTMLTitle(html string) string {
	matches := htmlTitlePattern.FindStringSubmatch(html)
	if len(matches) != 2 {
		return ""
	}
	return htmlUnescape(strings.TrimSpace(matches[1]))
}
func extractVideoPreviewURL(html string) string {
normalizedHTML := strings.ReplaceAll(html, `\\\/`, `/`)
normalizedHTML = strings.ReplaceAll(normalizedHTML, `\/`, `/`)
@@ -737,6 +846,19 @@ func extractArtgridBackgroundThumbnail(html, clipID string) string {
return ""
}
// pickArtgridImageURL selects the most likely thumbnail for an Artgrid clip
// from urls.
//
// It returns the first URL that contains an image extension (.jpg, .jpeg,
// .png or .webp, matched case-insensitively) and also either contains clipID,
// or contains "graded-thumbnail" or "imgix". If none qualifies it falls back
// to pickImageURL(urls).
//
// An empty clipID never counts as a match. strings.Contains reports true for
// an empty substring, so without the guard the first image-looking URL of any
// kind would be returned ahead of real Artgrid thumbnails.
func pickArtgridImageURL(urls []string, clipID string) string {
	for _, item := range urls {
		lower := strings.ToLower(item)
		isImage := strings.Contains(lower, ".jpg") ||
			strings.Contains(lower, ".jpeg") ||
			strings.Contains(lower, ".png") ||
			strings.Contains(lower, ".webp")
		if !isImage {
			continue
		}
		matchesClip := clipID != "" && strings.Contains(item, clipID)
		if matchesClip || strings.Contains(lower, "graded-thumbnail") || strings.Contains(lower, "imgix") {
			return item
		}
	}
	return pickImageURL(urls)
}
func extractArtgridClipID(link string) string {
patterns := []*regexp.Regexp{
regexp.MustCompile(`/clip/([0-9]+)/`),
@@ -840,30 +962,38 @@ func (s *SearchService) fetchText(target string) (string, error) {
s.debug("search_service:fetch_cache_hit", map[string]any{"type": "html", "target": target, "bytes": len(cached)})
return cached, nil
}
req, err := newBrowserRequest(http.MethodGet, target, "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")
for _, strategy := range []string{"default", "provider"} {
req, err := newBrowserRequest(http.MethodGet, target, "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", strategy)
if err != nil {
return "", err
}
s.debug("search_service:envato_fetch_strategy", map[string]any{"target": target, "strategy": strategy})
resp, err := s.Client.Do(req)
if err != nil {
continue
}
data, readErr := io.ReadAll(io.LimitReader(resp.Body, 1024*1024))
_ = resp.Body.Close()
if readErr != nil {
continue
}
if resp.StatusCode == http.StatusForbidden || resp.StatusCode == http.StatusServiceUnavailable {
continue
}
if resp.StatusCode >= 300 {
continue
}
if looksLikeCloudflareChallenge(string(data)) {
continue
}
body := string(data)
s.setCachedFetchResult(cacheKey, body, 3*time.Minute)
return body, nil
}
body, err := fetchTextViaPython(target)
if err != nil {
return "", err
}
resp, err := s.Client.Do(req)
if err != nil {
return "", err
}
defer resp.Body.Close()
if resp.StatusCode == http.StatusForbidden || resp.StatusCode == http.StatusServiceUnavailable {
return fetchTextViaPython(target)
}
if resp.StatusCode >= 300 {
return "", fmt.Errorf("fetch returned status %d", resp.StatusCode)
}
data, err := io.ReadAll(io.LimitReader(resp.Body, 1024*1024))
if err != nil {
return "", err
}
if looksLikeCloudflareChallenge(string(data)) {
return fetchTextViaPython(target)
}
body := string(data)
s.setCachedFetchResult(cacheKey, body, 3*time.Minute)
return body, nil
}
@@ -875,7 +1005,7 @@ func (s *SearchService) fetchJSONText(target string) (string, error) {
return cached, nil
}
req, err := newBrowserRequest(http.MethodGet, target, "application/json, text/json, */*")
req, err := newBrowserRequest(http.MethodGet, target, "application/json, text/json, */*", "provider")
if err != nil {
return "", err
}
@@ -1034,19 +1164,35 @@ func cleanArtgridDescription(description string) string {
return strings.TrimSpace(description)
}
// artgridHTMLSignals inspects html for evidence that it is the page for the
// Artgrid clip identified by clipID and returns one boolean per signal:
//
//   - og_url_clip, canonical_clip, al_web_clip: the og:url meta, canonical URL
//     or al:web:url meta contains clipID (case-sensitive).
//   - body_main_clipvideo, body_clip_path, body_clip_id: the lowercased page
//     contains "main-clipvideo_<id>", "/clip/<id>/" or the bare ID.
//   - title_mentions_clip: the <title> mentions "artgrid" or "artlist". Despite
//     the key name this is a brand check and does not involve clipID.
//   - image_clip: the og:image or twitter:image URL contains the ID
//     (case-insensitive).
//
// Checks made against lowercased text use a lowercased clipID so an ID with
// uppercase characters can still match. When clipID is empty, every
// clip-specific signal is false: strings.Contains would otherwise report a
// match for the empty string and the signals would be meaningless.
func artgridHTMLSignals(html, clipID string) map[string]bool {
	ogURL := extractMetaContent(html, "og:url")
	canonical := extractCanonicalURL(html)
	alWebURL := extractMetaContent(html, "al:web:url")
	lowerHTML := strings.ToLower(html)
	title := strings.ToLower(extractHTMLTitle(html))
	ogImage := strings.ToLower(extractMetaContent(html, "og:image"))
	twitterImage := strings.ToLower(extractMetaContent(html, "twitter:image"))

	hasID := clipID != ""
	lowerID := strings.ToLower(clipID)

	return map[string]bool{
		"og_url_clip":         hasID && strings.Contains(ogURL, clipID),
		"canonical_clip":      hasID && strings.Contains(canonical, clipID),
		"al_web_clip":         hasID && strings.Contains(alWebURL, clipID),
		"body_main_clipvideo": hasID && strings.Contains(lowerHTML, "main-clipvideo_"+lowerID),
		"body_clip_path":      hasID && strings.Contains(lowerHTML, "/clip/"+lowerID+"/"),
		"body_clip_id":        hasID && strings.Contains(lowerHTML, lowerID),
		"title_mentions_clip": strings.Contains(title, "artgrid") || strings.Contains(title, "artlist"),
		"image_clip":          hasID && (strings.Contains(ogImage, lowerID) || strings.Contains(twitterImage, lowerID)),
	}
}
func isMatchingArtgridClipPage(html, clipID string) bool {
if clipID == "" {
return false
}
ogURL := extractMetaContent(html, "og:url")
canonical := extractCanonicalURL(html)
lowerHTML := strings.ToLower(html)
for _, candidate := range []string{ogURL, canonical} {
if strings.Contains(candidate, clipID) {
return true
}
signals := artgridHTMLSignals(html, clipID)
if signals["og_url_clip"] || signals["canonical_clip"] || signals["al_web_clip"] || signals["body_main_clipvideo"] || signals["body_clip_path"] || signals["image_clip"] {
return true
}
if strings.Contains(lowerHTML, "main-clipvideo_"+clipID) || strings.Contains(lowerHTML, "/clip/"+clipID+"/") {
if signals["body_clip_id"] && signals["title_mentions_clip"] {
return true
}
return false
@@ -1090,6 +1236,23 @@ func extractEnvatoPreviewFromHydration(html string) string {
return firstNonEmpty(pickBestEnvatoPreviewURL(urls), extractVideoPreviewURL(string(decoded)))
}
// collectEnvatoPreviewURL returns the first non-empty preview video URL for
// an Envato page, trying candidates in this order:
//
//  1. contentURL supplied by the caller;
//  2. the JSON-LD "contentUrl" value;
//  3. the twitter:player:stream, og:video, og:video:url and
//     og:video:secure_url meta tags;
//  4. the URL found by extractEnvatoPreviewFromHydration;
//  5. the best candidate among all URLs collected from the page;
//  6. the URL found by extractVideoPreviewURL;
//  7. URLs derived from pageThumbnail and then currentThumbnail.
//
// Every candidate is computed up front; firstNonEmpty only selects among them.
func collectEnvatoPreviewURL(html, pageThumbnail, currentThumbnail, contentURL string) string {
	pageURLs := collectURLs(html)

	jsonLD := extractJSONLDValue(html, "contentUrl")
	twitterStream := extractMetaContent(html, "twitter:player:stream")
	ogVideo := extractMetaContent(html, "og:video")
	ogVideoURL := extractMetaContent(html, "og:video:url")
	ogVideoSecure := extractMetaContent(html, "og:video:secure_url")
	hydration := extractEnvatoPreviewFromHydration(html)
	bestFromPage := pickBestEnvatoPreviewURL(pageURLs)
	inlineVideo := extractVideoPreviewURL(html)
	fromPageThumb := deriveEnvatoPreviewFromThumbnail(pageThumbnail)
	fromCurrentThumb := deriveEnvatoPreviewFromThumbnail(currentThumbnail)

	return firstNonEmpty(
		contentURL,
		jsonLD,
		twitterStream,
		ogVideo,
		ogVideoURL,
		ogVideoSecure,
		hydration,
		bestFromPage,
		inlineVideo,
		fromPageThumb,
		fromCurrentThumb,
	)
}
func extractWindowAssignedValue(html, variable string) string {
pattern := regexp.MustCompile(`window\.` + regexp.QuoteMeta(variable) + `\s*=\s*"([^"]+)"`)
matches := pattern.FindStringSubmatch(html)
@@ -1121,7 +1284,19 @@ func pickBestEnvatoPreviewURL(urls []string) string {
return ""
}
func newBrowserRequest(method, target, accept string) (*http.Request, error) {
// inferFetchReferer picks the Referer header value to send when fetching
// target, based on case-insensitive substring matches anywhere in the URL:
//   - "envatousercontent.com" or "elements.envato.com" -> the Envato Elements
//     home page;
//   - "artgrid" or "artlist" -> the Artgrid home page;
//   - anything else -> "".
//
// The Envato check runs first, so it wins when both would match.
func inferFetchReferer(target string) string {
	normalized := strings.ToLower(target)
	mentions := func(fragment string) bool {
		return strings.Contains(normalized, fragment)
	}

	if mentions("envatousercontent.com") || mentions("elements.envato.com") {
		return "https://elements.envato.com/"
	}
	if mentions("artgrid") || mentions("artlist") {
		return "https://artgrid.io/"
	}
	return ""
}
func newBrowserRequest(method, target, accept, strategy string) (*http.Request, error) {
req, err := http.NewRequest(method, target, nil)
if err != nil {
return nil, err
@@ -1131,6 +1306,14 @@ func newBrowserRequest(method, target, accept string) (*http.Request, error) {
if accept != "" {
req.Header.Set("Accept", accept)
}
if strategy == "provider" {
req.Header.Set("Referer", inferFetchReferer(target))
req.Header.Set("Upgrade-Insecure-Requests", "1")
req.Header.Set("Sec-Fetch-Dest", "document")
req.Header.Set("Sec-Fetch-Mode", "navigate")
req.Header.Set("Sec-Fetch-Site", "none")
req.Header.Set("Sec-Fetch-User", "?1")
}
return req, nil
}
@@ -1142,11 +1325,17 @@ req = Request(sys.argv[1], headers={
"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36",
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
"Accept-Language": "en-US,en;q=0.9",
"Referer": sys.argv[2],
"Upgrade-Insecure-Requests": "1",
"Sec-Fetch-Dest": "document",
"Sec-Fetch-Mode": "navigate",
"Sec-Fetch-Site": "none",
"Sec-Fetch-User": "?1",
})
with urlopen(req, timeout=20) as resp:
sys.stdout.buffer.write(resp.read(1024 * 1024))
`
output, err := exec.Command("python3", "-c", script, target).CombinedOutput()
output, err := exec.Command("python3", "-c", script, target, inferFetchReferer(target)).CombinedOutput()
if err != nil {
return "", fmt.Errorf("python fallback failed: %v: %s", err, truncateBytes(output, 300))
}