Stabilize search pipeline and improve preview diagnostics
build-push / docker (push) Successful in 4m14s
build-push / docker (push) Successful in 4m14s
This commit is contained in:
+178
-21
@@ -6,6 +6,7 @@ import (
|
||||
"io"
|
||||
"net/http"
|
||||
"net/url"
|
||||
"os/exec"
|
||||
"regexp"
|
||||
"sort"
|
||||
"strings"
|
||||
@@ -54,6 +55,7 @@ func (s *SearchService) SearchMedia(queries []string, enabledPlatforms map[strin
|
||||
name string
|
||||
categories string
|
||||
engine string
|
||||
maxResults int
|
||||
build func(string) []string
|
||||
accept func(SearchResult) bool
|
||||
}
|
||||
@@ -63,6 +65,7 @@ func (s *SearchService) SearchMedia(queries []string, enabledPlatforms map[strin
|
||||
name: "Envato",
|
||||
categories: "general",
|
||||
engine: s.WebEngine,
|
||||
maxResults: 8,
|
||||
build: buildEnvatoQueries,
|
||||
accept: isRenderableEnvatoResult,
|
||||
},
|
||||
@@ -70,6 +73,7 @@ func (s *SearchService) SearchMedia(queries []string, enabledPlatforms map[strin
|
||||
name: "Artgrid",
|
||||
categories: "general",
|
||||
engine: s.WebEngine,
|
||||
maxResults: 8,
|
||||
build: buildArtgridQueries,
|
||||
accept: isRenderableArtgridResult,
|
||||
},
|
||||
@@ -77,16 +81,18 @@ func (s *SearchService) SearchMedia(queries []string, enabledPlatforms map[strin
|
||||
name: "Google Video",
|
||||
categories: "videos",
|
||||
engine: s.GoogleVideoEngine,
|
||||
maxResults: 6,
|
||||
build: buildGoogleVideoQueries,
|
||||
accept: isUsefulGoogleVideoResult,
|
||||
},
|
||||
}
|
||||
|
||||
seen := map[string]bool{}
|
||||
sourceCounts := map[string]int{}
|
||||
results := make([]SearchResult, 0, 90)
|
||||
var lastErr error
|
||||
|
||||
baseQueries := limitQueries(queries, 5)
|
||||
baseQueries := limitQueries(queries, 3)
|
||||
for _, base := range baseQueries {
|
||||
base = strings.TrimSpace(base)
|
||||
if base == "" {
|
||||
@@ -96,7 +102,13 @@ func (s *SearchService) SearchMedia(queries []string, enabledPlatforms map[strin
|
||||
if len(enabledPlatforms) > 0 && !enabledPlatforms[strings.ToLower(source.name)] {
|
||||
continue
|
||||
}
|
||||
if sourceCounts[source.name] >= source.maxResults {
|
||||
continue
|
||||
}
|
||||
for _, searchQuery := range source.build(base) {
|
||||
if sourceCounts[source.name] >= source.maxResults {
|
||||
break
|
||||
}
|
||||
items, err := s.search(searchQuery, source.categories, source.engine, source.name)
|
||||
if err != nil {
|
||||
lastErr = err
|
||||
@@ -112,6 +124,10 @@ func (s *SearchService) SearchMedia(queries []string, enabledPlatforms map[strin
|
||||
}
|
||||
seen[item.Link] = true
|
||||
results = append(results, item)
|
||||
sourceCounts[source.name]++
|
||||
if sourceCounts[source.name] >= source.maxResults {
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -128,7 +144,7 @@ func (s *SearchService) SearchMedia(queries []string, enabledPlatforms map[strin
|
||||
}
|
||||
|
||||
func (s *SearchService) EnrichResults(results []SearchResult) []SearchResult {
|
||||
limit := minInt(len(results), 24)
|
||||
limit := minInt(len(results), 18)
|
||||
if limit == 0 {
|
||||
return results
|
||||
}
|
||||
@@ -170,14 +186,32 @@ func (s *SearchService) enrichEnvato(result SearchResult) SearchResult {
|
||||
if err != nil {
|
||||
return result
|
||||
}
|
||||
if result.ThumbnailURL == "" {
|
||||
result.ThumbnailURL = firstNonEmpty(
|
||||
extractMetaContent(html, "og:image"),
|
||||
extractMetaContent(html, "twitter:image"),
|
||||
)
|
||||
result.Title = firstNonEmpty(
|
||||
extractMetaContent(html, "og:title"),
|
||||
result.Title,
|
||||
)
|
||||
result.Snippet = firstNonEmpty(
|
||||
extractMetaContent(html, "og:description"),
|
||||
extractMetaContent(html, "description"),
|
||||
result.Snippet,
|
||||
)
|
||||
|
||||
pageThumbnail := firstNonEmpty(
|
||||
extractMetaContent(html, "og:image"),
|
||||
extractMetaContent(html, "twitter:image"),
|
||||
extractJSONLDValue(html, "thumbnailUrl"),
|
||||
)
|
||||
if shouldPreferPageThumbnail(result.ThumbnailURL, result.Link) {
|
||||
result.ThumbnailURL = pageThumbnail
|
||||
}
|
||||
if result.PreviewVideoURL == "" {
|
||||
result.PreviewVideoURL = extractVideoPreviewURL(html)
|
||||
result.PreviewVideoURL = firstNonEmpty(
|
||||
extractJSONLDValue(html, "contentUrl"),
|
||||
extractMetaContent(html, "twitter:player:stream"),
|
||||
extractVideoPreviewURL(html),
|
||||
deriveEnvatoPreviewFromThumbnail(pageThumbnail),
|
||||
deriveEnvatoPreviewFromThumbnail(result.ThumbnailURL),
|
||||
)
|
||||
}
|
||||
return result
|
||||
}
|
||||
@@ -203,17 +237,30 @@ func (s *SearchService) enrichArtgrid(result SearchResult) SearchResult {
|
||||
if result.ThumbnailURL == "" || result.PreviewVideoURL == "" {
|
||||
html, err := s.fetchText(result.Link)
|
||||
if err == nil {
|
||||
if result.ThumbnailURL == "" {
|
||||
result.ThumbnailURL = firstNonEmpty(
|
||||
extractMetaContent(html, "og:image"),
|
||||
extractMetaContent(html, "twitter:image"),
|
||||
)
|
||||
if result.ThumbnailURL == "" {
|
||||
result.ThumbnailURL = extractArtgridBackgroundThumbnail(html, clipID)
|
||||
}
|
||||
result.Title = firstNonEmpty(
|
||||
extractMetaContent(html, "og:title"),
|
||||
result.Title,
|
||||
)
|
||||
result.Snippet = firstNonEmpty(
|
||||
extractMetaContent(html, "og:description"),
|
||||
extractMetaContent(html, "description"),
|
||||
result.Snippet,
|
||||
)
|
||||
pageThumbnail := firstNonEmpty(
|
||||
extractMetaContent(html, "og:image"),
|
||||
extractMetaContent(html, "twitter:image"),
|
||||
extractArtgridBackgroundThumbnail(html, clipID),
|
||||
extractJSONLDValue(html, "image"),
|
||||
)
|
||||
if shouldPreferPageThumbnail(result.ThumbnailURL, result.Link) {
|
||||
result.ThumbnailURL = pageThumbnail
|
||||
}
|
||||
if result.PreviewVideoURL == "" {
|
||||
result.PreviewVideoURL = extractVideoPreviewURL(html)
|
||||
result.PreviewVideoURL = firstNonEmpty(
|
||||
extractJSONLDValue(html, "contentUrl"),
|
||||
extractMetaContent(html, "twitter:player:stream"),
|
||||
extractVideoPreviewURL(html),
|
||||
)
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -282,7 +329,6 @@ func (s *SearchService) search(query, categories, engine, source string) ([]Sear
|
||||
func buildGoogleVideoQueries(base string) []string {
|
||||
return []string{
|
||||
fmt.Sprintf(`"%s" ("stock footage" OR "b-roll" OR cinematic OR "establishing shot" OR editorial) -tutorial -"how to" -review -reaction -course -podcast -vlog -interview -breakdown -edit -editing`, base),
|
||||
fmt.Sprintf(`"%s" ("cinematic footage" OR "free stock footage" OR "4k footage") -tutorial -"how to" -review`, base),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -310,6 +356,8 @@ func isUsefulGoogleVideoResult(result SearchResult) bool {
|
||||
"tutorial", "how to", "review", "reaction", "podcast", "interview", "walkthrough",
|
||||
"course", "lesson", "edit tutorial", "editing tutorial", "premiere pro", "after effects",
|
||||
"breakdown", "explained", "vlog", "tips", "guide", "learn", "free download",
|
||||
"bgm", "music", "song", "lyrics", "audio", "soundtrack", "trailer", "teaser",
|
||||
"full movie", "movie clip", "status", "whatsapp status", "fan cam", "fancam",
|
||||
} {
|
||||
if strings.Contains(text, banned) {
|
||||
return false
|
||||
@@ -477,11 +525,18 @@ func pickVideoURL(urls []string) string {
|
||||
}
|
||||
|
||||
func (s *SearchService) fetchText(target string) (string, error) {
|
||||
resp, err := s.Client.Get(target)
|
||||
req, err := newBrowserRequest(http.MethodGet, target, "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
resp, err := s.Client.Do(req)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
if resp.StatusCode == http.StatusForbidden || resp.StatusCode == http.StatusServiceUnavailable {
|
||||
return fetchTextViaPython(target)
|
||||
}
|
||||
if resp.StatusCode >= 300 {
|
||||
return "", fmt.Errorf("fetch returned status %d", resp.StatusCode)
|
||||
}
|
||||
@@ -489,15 +544,17 @@ func (s *SearchService) fetchText(target string) (string, error) {
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
if looksLikeCloudflareChallenge(string(data)) {
|
||||
return fetchTextViaPython(target)
|
||||
}
|
||||
return string(data), nil
|
||||
}
|
||||
|
||||
func (s *SearchService) fetchJSONText(target string) (string, error) {
|
||||
req, err := http.NewRequest(http.MethodGet, target, nil)
|
||||
req, err := newBrowserRequest(http.MethodGet, target, "application/json, text/json, */*")
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
req.Header.Set("Accept", "application/json, text/json")
|
||||
resp, err := s.Client.Do(req)
|
||||
if err != nil {
|
||||
return "", err
|
||||
@@ -522,6 +579,106 @@ func firstNonEmpty(values ...string) string {
|
||||
return ""
|
||||
}
|
||||
|
||||
func shouldPreferPageThumbnail(current, pageLink string) bool {
|
||||
current = strings.TrimSpace(current)
|
||||
if current == "" {
|
||||
return true
|
||||
}
|
||||
lower := strings.ToLower(current)
|
||||
if strings.Contains(lower, "imgs.search.brave.com") || strings.Contains(lower, "googleusercontent.com") || strings.Contains(lower, "bing.com") {
|
||||
return true
|
||||
}
|
||||
currentHost := hostOf(current)
|
||||
pageHost := hostOf(pageLink)
|
||||
return currentHost == "" || (pageHost != "" && currentHost != pageHost)
|
||||
}
|
||||
|
||||
// hostOf returns the lower-cased host component of raw (including any
// ":port" suffix), or "" when raw cannot be parsed as a URL or carries no host.
func hostOf(raw string) string {
	if u, err := url.Parse(raw); err == nil {
		return strings.ToLower(u.Host)
	}
	return ""
}
|
||||
|
||||
func extractJSONLDValue(html, key string) string {
|
||||
pattern := regexp.MustCompile(`"` + regexp.QuoteMeta(key) + `"\s*:\s*"(https?:\\?/\\?/[^"]+|[^"]+)"`)
|
||||
matches := pattern.FindAllStringSubmatch(html, -1)
|
||||
for _, match := range matches {
|
||||
if len(match) != 2 {
|
||||
continue
|
||||
}
|
||||
value := strings.ReplaceAll(match[1], `\/`, `/`)
|
||||
value = strings.ReplaceAll(value, `\u002F`, `/`)
|
||||
value = strings.ReplaceAll(value, `\\`, "")
|
||||
value = htmlUnescape(value)
|
||||
if strings.TrimSpace(value) != "" {
|
||||
return value
|
||||
}
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
func deriveEnvatoPreviewFromThumbnail(thumbnail string) string {
|
||||
candidate := htmlUnescape(strings.TrimSpace(thumbnail))
|
||||
if candidate == "" {
|
||||
return ""
|
||||
}
|
||||
candidate = strings.ReplaceAll(candidate, "&", "&")
|
||||
if strings.Contains(candidate, "/video_preview/") {
|
||||
if idx := strings.Index(candidate, "?"); idx >= 0 {
|
||||
candidate = candidate[:idx]
|
||||
}
|
||||
return regexp.MustCompile(`/video_preview/[^/]+\.(?:jpg|jpeg|png|webp)$`).ReplaceAllString(candidate, `/watermarked_preview/watermarked_preview.mp4`)
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
// newBrowserRequest builds a body-less HTTP request for target that carries a
// desktop-Chrome User-Agent and an English Accept-Language header. The Accept
// header is set only when accept is non-empty.
//
// Any error from http.NewRequest (for example an unparsable target) is
// returned as-is together with a nil request.
func newBrowserRequest(method, target, accept string) (*http.Request, error) {
	request, err := http.NewRequest(method, target, nil)
	if err != nil {
		return nil, err
	}

	headers := request.Header
	headers.Set("User-Agent", "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36")
	headers.Set("Accept-Language", "en-US,en;q=0.9")
	if accept == "" {
		return request, nil
	}
	headers.Set("Accept", accept)
	return request, nil
}
|
||||
|
||||
func fetchTextViaPython(target string) (string, error) {
|
||||
script := `
|
||||
from urllib.request import Request, urlopen
|
||||
import sys
|
||||
req = Request(sys.argv[1], headers={
|
||||
"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36",
|
||||
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
|
||||
"Accept-Language": "en-US,en;q=0.9",
|
||||
})
|
||||
with urlopen(req, timeout=20) as resp:
|
||||
sys.stdout.buffer.write(resp.read(1024 * 1024))
|
||||
`
|
||||
output, err := exec.Command("python3", "-c", script, target).CombinedOutput()
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("python fallback failed: %v: %s", err, truncateBytes(output, 300))
|
||||
}
|
||||
return string(output), nil
|
||||
}
|
||||
|
||||
// looksLikeCloudflareChallenge reports whether body contains, in any letter
// case, one of the marker phrases "cf-mitigated", "attention required" or
// "just a moment".
func looksLikeCloudflareChallenge(body string) bool {
	folded := strings.ToLower(body)
	for _, marker := range []string{"cf-mitigated", "attention required", "just a moment"} {
		if strings.Contains(folded, marker) {
			return true
		}
	}
	return false
}
|
||||
|
||||
// truncateBytes converts data to a string, trims surrounding whitespace and,
// when the result is longer than limit bytes, cuts it to at most limit bytes
// and appends "...".
//
// The cut never lands inside a multi-byte UTF-8 sequence: it backs up to the
// start of the rune that would otherwise be split. A negative limit is
// treated as zero instead of panicking on the slice expression.
func truncateBytes(data []byte, limit int) string {
	trimmed := strings.TrimSpace(string(data))
	if limit < 0 {
		limit = 0
	}
	if len(trimmed) <= limit {
		return trimmed
	}

	// Bytes of the form 10xxxxxx are UTF-8 continuation bytes; step back
	// until the cut position sits on the first byte of a rune.
	cut := limit
	for cut > 0 && trimmed[cut]&0xC0 == 0x80 {
		cut--
	}
	return trimmed[:cut] + "..."
}
|
||||
|
||||
func limitQueries(queries []string, limit int) []string {
|
||||
seen := map[string]bool{}
|
||||
filtered := make([]string, 0, minInt(len(queries), limit))
|
||||
|
||||
@@ -0,0 +1,32 @@
|
||||
package services
|
||||
|
||||
import "testing"
|
||||
|
||||
func TestExtractVideoPreviewURLFindsEnvatoPreview(t *testing.T) {
|
||||
html := `<script type="application/ld+json">{"contentUrl":"https://video-previews.elements.envatousercontent.com/ad0a3abc-7eb0-4075-8f68-8198f9a08777/watermarked_preview/watermarked_preview.mp4"}</script>`
|
||||
got := firstNonEmpty(extractJSONLDValue(html, "contentUrl"), extractVideoPreviewURL(html))
|
||||
want := "https://video-previews.elements.envatousercontent.com/ad0a3abc-7eb0-4075-8f68-8198f9a08777/watermarked_preview/watermarked_preview.mp4"
|
||||
if got != want {
|
||||
t.Fatalf("expected %q, got %q", want, got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestDeriveEnvatoPreviewFromThumbnail(t *testing.T) {
|
||||
thumb := "https://elements-resized.envatousercontent.com/elements-video-cover-images/ad0a3abc-7eb0-4075-8f68-8198f9a08777/video_preview/video_preview_0000.jpg?w=1200&h=630"
|
||||
got := deriveEnvatoPreviewFromThumbnail(thumb)
|
||||
want := "https://elements-resized.envatousercontent.com/elements-video-cover-images/ad0a3abc-7eb0-4075-8f68-8198f9a08777/watermarked_preview/watermarked_preview.mp4"
|
||||
if got != want {
|
||||
t.Fatalf("expected %q, got %q", want, got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestIsUsefulGoogleVideoResultRejectsMusicResults(t *testing.T) {
|
||||
result := SearchResult{
|
||||
Title: "Couple Friendly Sad Bgm Movie Best Bgm",
|
||||
Link: "https://www.youtube.com/watch?v=LGP4wiXSw8c",
|
||||
Snippet: "romantic bgm soundtrack",
|
||||
}
|
||||
if isUsefulGoogleVideoResult(result) {
|
||||
t.Fatal("expected bgm/music result to be rejected")
|
||||
}
|
||||
}
|
||||
@@ -27,6 +27,7 @@ type GeminiService struct {
|
||||
type AIRecommendation struct {
|
||||
Title string `json:"title"`
|
||||
Link string `json:"link"`
|
||||
Snippet string `json:"snippet"`
|
||||
ThumbnailURL string `json:"thumbnailUrl"`
|
||||
PreviewVideoURL string `json:"previewVideoUrl"`
|
||||
Source string `json:"source"`
|
||||
@@ -232,6 +233,7 @@ User query: ` + query,
|
||||
recommendations = append(recommendations, AIRecommendation{
|
||||
Title: src.Title,
|
||||
Link: src.Link,
|
||||
Snippet: src.Snippet,
|
||||
ThumbnailURL: src.ThumbnailURL,
|
||||
PreviewVideoURL: src.PreviewVideoURL,
|
||||
Source: src.Source,
|
||||
@@ -245,6 +247,7 @@ User query: ` + query,
|
||||
recommendations = append(recommendations, AIRecommendation{
|
||||
Title: candidate.Title,
|
||||
Link: candidate.Link,
|
||||
Snippet: candidate.Snippet,
|
||||
ThumbnailURL: candidate.ThumbnailURL,
|
||||
PreviewVideoURL: candidate.PreviewVideoURL,
|
||||
Source: candidate.Source,
|
||||
@@ -262,10 +265,26 @@ func fetchImageAsInlineData(client *http.Client, imageURL string) (string, strin
|
||||
return "", "", fmt.Errorf("image url is empty")
|
||||
}
|
||||
resp, err := client.Get(imageURL)
|
||||
if err != nil {
|
||||
return "", "", err
|
||||
if err == nil {
|
||||
defer resp.Body.Close()
|
||||
}
|
||||
if err != nil || resp.StatusCode >= 300 {
|
||||
req, reqErr := newBrowserStyleImageRequest(imageURL)
|
||||
if reqErr != nil {
|
||||
if err != nil {
|
||||
return "", "", err
|
||||
}
|
||||
return "", "", reqErr
|
||||
}
|
||||
if resp != nil {
|
||||
resp.Body.Close()
|
||||
}
|
||||
resp, err = client.Do(req)
|
||||
if err != nil {
|
||||
return "", "", err
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
if resp.StatusCode >= 300 {
|
||||
return "", "", fmt.Errorf("thumbnail fetch failed with %d", resp.StatusCode)
|
||||
@@ -284,6 +303,17 @@ func fetchImageAsInlineData(client *http.Client, imageURL string) (string, strin
|
||||
return base64.StdEncoding.EncodeToString(data), mimeType, nil
|
||||
}
|
||||
|
||||
// newBrowserStyleImageRequest builds a GET request for imageURL carrying the
// User-Agent, Accept and Accept-Language headers a desktop Chrome browser
// would send when loading an image.
//
// Any error from http.NewRequest is returned as-is with a nil request.
func newBrowserStyleImageRequest(imageURL string) (*http.Request, error) {
	request, err := http.NewRequest(http.MethodGet, imageURL, nil)
	if err != nil {
		return nil, err
	}

	browserHeaders := [][2]string{
		{"User-Agent", "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36"},
		{"Accept", "image/avif,image/webp,image/apng,image/svg+xml,image/*,*/*;q=0.8"},
		{"Accept-Language", "en-US,en;q=0.9"},
	}
	for _, pair := range browserHeaders {
		request.Header.Set(pair[0], pair[1])
	}
	return request, nil
}
|
||||
|
||||
func fetchCandidateVisualInlineData(client *http.Client, candidate SearchResult) (string, string, error) {
|
||||
if candidate.ThumbnailURL != "" {
|
||||
data, mimeType, err := fetchImageAsInlineData(client, candidate.ThumbnailURL)
|
||||
|
||||
Reference in New Issue
Block a user