Stabilize search pipeline and improve preview diagnostics
build-push / docker (push) Successful in 4m14s

This commit is contained in:
AI Assistant
2026-03-13 18:32:54 +09:00
parent 6f3149a443
commit 7dfb1ad2de
8 changed files with 463 additions and 45 deletions
+178 -21
View File
@@ -6,6 +6,7 @@ import (
"io"
"net/http"
"net/url"
"os/exec"
"regexp"
"sort"
"strings"
@@ -54,6 +55,7 @@ func (s *SearchService) SearchMedia(queries []string, enabledPlatforms map[strin
name string
categories string
engine string
maxResults int
build func(string) []string
accept func(SearchResult) bool
}
@@ -63,6 +65,7 @@ func (s *SearchService) SearchMedia(queries []string, enabledPlatforms map[strin
name: "Envato",
categories: "general",
engine: s.WebEngine,
maxResults: 8,
build: buildEnvatoQueries,
accept: isRenderableEnvatoResult,
},
@@ -70,6 +73,7 @@ func (s *SearchService) SearchMedia(queries []string, enabledPlatforms map[strin
name: "Artgrid",
categories: "general",
engine: s.WebEngine,
maxResults: 8,
build: buildArtgridQueries,
accept: isRenderableArtgridResult,
},
@@ -77,16 +81,18 @@ func (s *SearchService) SearchMedia(queries []string, enabledPlatforms map[strin
name: "Google Video",
categories: "videos",
engine: s.GoogleVideoEngine,
maxResults: 6,
build: buildGoogleVideoQueries,
accept: isUsefulGoogleVideoResult,
},
}
seen := map[string]bool{}
sourceCounts := map[string]int{}
results := make([]SearchResult, 0, 90)
var lastErr error
baseQueries := limitQueries(queries, 5)
baseQueries := limitQueries(queries, 3)
for _, base := range baseQueries {
base = strings.TrimSpace(base)
if base == "" {
@@ -96,7 +102,13 @@ func (s *SearchService) SearchMedia(queries []string, enabledPlatforms map[strin
if len(enabledPlatforms) > 0 && !enabledPlatforms[strings.ToLower(source.name)] {
continue
}
if sourceCounts[source.name] >= source.maxResults {
continue
}
for _, searchQuery := range source.build(base) {
if sourceCounts[source.name] >= source.maxResults {
break
}
items, err := s.search(searchQuery, source.categories, source.engine, source.name)
if err != nil {
lastErr = err
@@ -112,6 +124,10 @@ func (s *SearchService) SearchMedia(queries []string, enabledPlatforms map[strin
}
seen[item.Link] = true
results = append(results, item)
sourceCounts[source.name]++
if sourceCounts[source.name] >= source.maxResults {
break
}
}
}
}
@@ -128,7 +144,7 @@ func (s *SearchService) SearchMedia(queries []string, enabledPlatforms map[strin
}
func (s *SearchService) EnrichResults(results []SearchResult) []SearchResult {
limit := minInt(len(results), 24)
limit := minInt(len(results), 18)
if limit == 0 {
return results
}
@@ -170,14 +186,32 @@ func (s *SearchService) enrichEnvato(result SearchResult) SearchResult {
if err != nil {
return result
}
if result.ThumbnailURL == "" {
result.ThumbnailURL = firstNonEmpty(
extractMetaContent(html, "og:image"),
extractMetaContent(html, "twitter:image"),
)
result.Title = firstNonEmpty(
extractMetaContent(html, "og:title"),
result.Title,
)
result.Snippet = firstNonEmpty(
extractMetaContent(html, "og:description"),
extractMetaContent(html, "description"),
result.Snippet,
)
pageThumbnail := firstNonEmpty(
extractMetaContent(html, "og:image"),
extractMetaContent(html, "twitter:image"),
extractJSONLDValue(html, "thumbnailUrl"),
)
if shouldPreferPageThumbnail(result.ThumbnailURL, result.Link) {
result.ThumbnailURL = pageThumbnail
}
if result.PreviewVideoURL == "" {
result.PreviewVideoURL = extractVideoPreviewURL(html)
result.PreviewVideoURL = firstNonEmpty(
extractJSONLDValue(html, "contentUrl"),
extractMetaContent(html, "twitter:player:stream"),
extractVideoPreviewURL(html),
deriveEnvatoPreviewFromThumbnail(pageThumbnail),
deriveEnvatoPreviewFromThumbnail(result.ThumbnailURL),
)
}
return result
}
@@ -203,17 +237,30 @@ func (s *SearchService) enrichArtgrid(result SearchResult) SearchResult {
if result.ThumbnailURL == "" || result.PreviewVideoURL == "" {
html, err := s.fetchText(result.Link)
if err == nil {
if result.ThumbnailURL == "" {
result.ThumbnailURL = firstNonEmpty(
extractMetaContent(html, "og:image"),
extractMetaContent(html, "twitter:image"),
)
if result.ThumbnailURL == "" {
result.ThumbnailURL = extractArtgridBackgroundThumbnail(html, clipID)
}
result.Title = firstNonEmpty(
extractMetaContent(html, "og:title"),
result.Title,
)
result.Snippet = firstNonEmpty(
extractMetaContent(html, "og:description"),
extractMetaContent(html, "description"),
result.Snippet,
)
pageThumbnail := firstNonEmpty(
extractMetaContent(html, "og:image"),
extractMetaContent(html, "twitter:image"),
extractArtgridBackgroundThumbnail(html, clipID),
extractJSONLDValue(html, "image"),
)
if shouldPreferPageThumbnail(result.ThumbnailURL, result.Link) {
result.ThumbnailURL = pageThumbnail
}
if result.PreviewVideoURL == "" {
result.PreviewVideoURL = extractVideoPreviewURL(html)
result.PreviewVideoURL = firstNonEmpty(
extractJSONLDValue(html, "contentUrl"),
extractMetaContent(html, "twitter:player:stream"),
extractVideoPreviewURL(html),
)
}
}
}
@@ -282,7 +329,6 @@ func (s *SearchService) search(query, categories, engine, source string) ([]Sear
// buildGoogleVideoQueries formats the search strings used for the
// "Google Video" source from a single base query.
//
// base is inserted as a double-quoted phrase, followed by an OR-group of
// footage-oriented terms and a list of "-term" tokens (presumably
// search-engine exclusions for tutorial/review style content; confirm against
// the engine's query syntax).
func buildGoogleVideoQueries(base string) []string {
return []string{
fmt.Sprintf(`"%s" ("stock footage" OR "b-roll" OR cinematic OR "establishing shot" OR editorial) -tutorial -"how to" -review -reaction -course -podcast -vlog -interview -breakdown -edit -editing`, base),
fmt.Sprintf(`"%s" ("cinematic footage" OR "free stock footage" OR "4k footage") -tutorial -"how to" -review`, base),
}
}
@@ -310,6 +356,8 @@ func isUsefulGoogleVideoResult(result SearchResult) bool {
"tutorial", "how to", "review", "reaction", "podcast", "interview", "walkthrough",
"course", "lesson", "edit tutorial", "editing tutorial", "premiere pro", "after effects",
"breakdown", "explained", "vlog", "tips", "guide", "learn", "free download",
"bgm", "music", "song", "lyrics", "audio", "soundtrack", "trailer", "teaser",
"full movie", "movie clip", "status", "whatsapp status", "fan cam", "fancam",
} {
if strings.Contains(text, banned) {
return false
@@ -477,11 +525,18 @@ func pickVideoURL(urls []string) string {
}
func (s *SearchService) fetchText(target string) (string, error) {
resp, err := s.Client.Get(target)
req, err := newBrowserRequest(http.MethodGet, target, "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")
if err != nil {
return "", err
}
resp, err := s.Client.Do(req)
if err != nil {
return "", err
}
defer resp.Body.Close()
if resp.StatusCode == http.StatusForbidden || resp.StatusCode == http.StatusServiceUnavailable {
return fetchTextViaPython(target)
}
if resp.StatusCode >= 300 {
return "", fmt.Errorf("fetch returned status %d", resp.StatusCode)
}
@@ -489,15 +544,17 @@ func (s *SearchService) fetchText(target string) (string, error) {
if err != nil {
return "", err
}
if looksLikeCloudflareChallenge(string(data)) {
return fetchTextViaPython(target)
}
return string(data), nil
}
func (s *SearchService) fetchJSONText(target string) (string, error) {
req, err := http.NewRequest(http.MethodGet, target, nil)
req, err := newBrowserRequest(http.MethodGet, target, "application/json, text/json, */*")
if err != nil {
return "", err
}
req.Header.Set("Accept", "application/json, text/json")
resp, err := s.Client.Do(req)
if err != nil {
return "", err
@@ -522,6 +579,106 @@ func firstNonEmpty(values ...string) string {
return ""
}
// shouldPreferPageThumbnail reports whether a thumbnail scraped from the
// result page should replace the thumbnail the result currently carries.
//
// It returns true when current is blank or has no parsable host, when current
// is served from a search-engine image proxy (Brave, Google, Bing), or when
// current is hosted somewhere other than pageLink's host. It returns false
// when both URLs share a host, or when pageLink has no host to compare with.
func shouldPreferPageThumbnail(current, pageLink string) bool {
	current = strings.TrimSpace(current)
	if current == "" {
		return true
	}
	thumb, err := url.Parse(current)
	if err != nil || thumb.Host == "" {
		return true
	}

	// Match proxies on the hostname itself rather than anywhere in the URL: a
	// plain substring test also fires for unrelated sites such as
	// "climbing.com" (which contains "bing.com") and for proxy names that
	// merely appear inside a path or query string.
	name := strings.ToLower(thumb.Hostname())
	for _, proxy := range []string{"imgs.search.brave.com", "googleusercontent.com", "bing.com"} {
		if name == proxy || strings.HasSuffix(name, "."+proxy) {
			return true
		}
	}

	page, err := url.Parse(pageLink)
	if err != nil || page.Host == "" {
		// Nothing to compare against, so keep the thumbnail we already have.
		return false
	}
	return strings.ToLower(thumb.Host) != strings.ToLower(page.Host)
}
// hostOf returns the lower-cased host component (including any port) of raw.
// It yields "" when raw cannot be parsed as a URL or carries no host.
func hostOf(raw string) string {
	if u, parseErr := url.Parse(raw); parseErr == nil {
		return strings.ToLower(u.Host)
	}
	return ""
}
// jsonUnicodeEscapePattern matches a single JSON \uXXXX escape sequence with
// hex digits in either case.
var jsonUnicodeEscapePattern = regexp.MustCompile(`\\u[0-9a-fA-F]{4}`)

// extractJSONLDValue scans html for `"key": "value"` pairs and returns the
// first value that is non-blank after unescaping, or "" when none is found.
//
// The whole document is searched textually (not only ld+json script blocks),
// and only string values are considered. Escaped slashes (`\/`) and every
// \uXXXX escape are decoded, so URLs serialized as `https:\u002f\u002f...` or
// with `\u0026` query separators come back usable. Any remaining `\\` pairs
// are dropped and the result is passed through htmlUnescape.
func extractJSONLDValue(html, key string) string {
	// key is neutralised with QuoteMeta, so MustCompile cannot panic on it.
	pattern := regexp.MustCompile(`"` + regexp.QuoteMeta(key) + `"\s*:\s*"([^"]+)"`)
	for _, match := range pattern.FindAllStringSubmatch(html, -1) {
		value := strings.ReplaceAll(match[1], `\/`, `/`)
		value = jsonUnicodeEscapePattern.ReplaceAllStringFunc(value, decodeJSONUnicodeEscape)
		value = strings.ReplaceAll(value, `\\`, "")
		value = htmlUnescape(value)
		if strings.TrimSpace(value) != "" {
			return value
		}
	}
	return ""
}

// decodeJSONUnicodeEscape converts one `\uXXXX` sequence (as matched by
// jsonUnicodeEscapePattern) into the UTF-8 encoding of that code point.
// Lone surrogate halves become U+FFFD.
func decodeJSONUnicodeEscape(escape string) string {
	var code rune
	for _, digit := range escape[2:] {
		code <<= 4
		switch {
		case digit >= '0' && digit <= '9':
			code |= digit - '0'
		case digit >= 'a' && digit <= 'f':
			code |= digit - 'a' + 10
		default:
			code |= digit - 'A' + 10
		}
	}
	return string(code)
}
// envatoPreviewImagePattern matches the trailing
// "/video_preview/<file>.<image ext>" segment of a thumbnail URL.
var envatoPreviewImagePattern = regexp.MustCompile(`/video_preview/[^/]+\.(?:jpg|jpeg|png|webp)$`)

// deriveEnvatoPreviewFromThumbnail guesses a preview video URL from a
// thumbnail URL by swapping a trailing "/video_preview/<image>" segment for
// "/watermarked_preview/watermarked_preview.mp4" (the layout this code assumes
// for Envato's CDN; not verified here).
//
// The query string is dropped before matching. It returns "" when thumbnail is
// blank or does not end in the expected segment, so that callers chaining
// fallbacks never receive the untouched image URL as a "video".
func deriveEnvatoPreviewFromThumbnail(thumbnail string) string {
	candidate := htmlUnescape(strings.TrimSpace(thumbnail))
	if candidate == "" {
		return ""
	}
	// Undo ampersands that are still entity-encoded after htmlUnescape
	// (double-escaped markup).
	candidate = strings.ReplaceAll(candidate, "&amp;", "&")
	if idx := strings.Index(candidate, "?"); idx >= 0 {
		candidate = candidate[:idx]
	}
	if !envatoPreviewImagePattern.MatchString(candidate) {
		return ""
	}
	return envatoPreviewImagePattern.ReplaceAllString(candidate, `/watermarked_preview/watermarked_preview.mp4`)
}
// newBrowserRequest creates a body-less request for target that carries
// desktop-Chrome style User-Agent and Accept-Language headers. The Accept
// header is set only when accept is non-empty. Any error from
// http.NewRequest is returned unchanged with a nil request.
func newBrowserRequest(method, target, accept string) (*http.Request, error) {
	request, err := http.NewRequest(method, target, nil)
	if err != nil {
		return nil, err
	}

	header := request.Header
	header.Set("User-Agent", "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36")
	header.Set("Accept-Language", "en-US,en;q=0.9")
	if accept == "" {
		return request, nil
	}
	header.Set("Accept", accept)
	return request, nil
}
// fetchTextViaPython downloads target by shelling out to python3 and urllib,
// sending browser-like request headers. The script passes timeout=20 to
// urlopen and reads at most 1 MiB of the response.
//
// Only the child's stdout is returned as the page text; stderr is captured
// separately so interpreter warnings cannot leak into the body. When python3
// cannot be started or exits non-zero, the returned error wraps the underlying
// error and includes up to 300 bytes of stderr.
//
// NOTE(review): there is no process-level deadline here; the only bound is
// the timeout handed to urlopen.
func fetchTextViaPython(target string) (string, error) {
	// target is passed as argv[1], never interpolated into the script text.
	// The script must stay flush-left: python -c rejects uniformly indented code.
	script := `
from urllib.request import Request, urlopen
import sys
req = Request(sys.argv[1], headers={
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.9",
})
with urlopen(req, timeout=20) as resp:
    sys.stdout.buffer.write(resp.read(1024 * 1024))
`
	var stdout, stderr strings.Builder
	cmd := exec.Command("python3", "-c", script, target)
	cmd.Stdout = &stdout
	cmd.Stderr = &stderr
	if err := cmd.Run(); err != nil {
		return "", fmt.Errorf("python fallback failed: %w: %s", err, truncateBytes([]byte(stderr.String()), 300))
	}
	return stdout.String(), nil
}
// looksLikeCloudflareChallenge reports whether body contains, in any letter
// case, one of the marker phrases this code treats as a sign of a Cloudflare
// interstitial: "cf-mitigated", "attention required" or "just a moment".
func looksLikeCloudflareChallenge(body string) bool {
	normalized := strings.ToLower(body)
	for _, marker := range [...]string{"cf-mitigated", "attention required", "just a moment"} {
		if strings.Contains(normalized, marker) {
			return true
		}
	}
	return false
}
// truncateBytes returns data as a whitespace-trimmed string cut to at most
// limit bytes, appending "..." when anything was removed.
//
// The cut is moved back (by up to three bytes) so that it never lands inside
// a multi-byte UTF-8 sequence; the result therefore stays valid UTF-8 when the
// input was. A negative limit is treated as zero.
func truncateBytes(data []byte, limit int) string {
	trimmed := strings.TrimSpace(string(data))
	if limit < 0 {
		limit = 0
	}
	if len(trimmed) <= limit {
		return trimmed
	}

	cut := limit
	// 10xxxxxx marks a UTF-8 continuation byte; step back to the rune start.
	// A rune has at most three continuation bytes, which bounds the loop even
	// for non-UTF-8 input.
	for back := 0; back < 3 && cut > 0 && trimmed[cut]&0xC0 == 0x80; back++ {
		cut--
	}
	return trimmed[:cut] + "..."
}
func limitQueries(queries []string, limit int) []string {
seen := map[string]bool{}
filtered := make([]string, 0, minInt(len(queries), limit))