Files
ai-media-hub/backend/services/cse.go
T
2026-03-17 13:20:44 +09:00

1491 lines
45 KiB
Go
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
package services
import (
"encoding/base64"
"encoding/json"
"fmt"
"io"
"math/rand"
"net/http"
"net/url"
"os/exec"
"regexp"
"sort"
"strings"
"sync"
"time"
)
// SearchResult is a single media hit, normalized across all collectors
// (Envato, Artgrid, Google Video) so the frontend can render them uniformly.
type SearchResult struct {
	Title           string `json:"title"`
	Link            string `json:"link"`            // canonical item URL; also the dedupe key
	DisplayLink     string `json:"displayLink"`     // host shown to the user
	Snippet         string `json:"snippet"`
	ThumbnailURL    string `json:"thumbnailUrl"`    // filled/upgraded during enrichment
	PreviewVideoURL string `json:"previewVideoUrl"` // direct mp4/m3u8 preview when found
	Source          string `json:"source"`          // collector name: "Envato", "Artgrid", "Google Video", or raw engine
}
// SearchService queries a SearxNG instance through a set of per-provider
// collectors and enriches raw hits with thumbnails / preview videos scraped
// from the provider pages. Safe for concurrent use: mutable state is guarded
// by cacheMu.
type SearchService struct {
	// BaseURL is the SearxNG root without trailing slash.
	BaseURL string
	// GoogleVideoEngine / WebEngine name the SearxNG engines to pin queries to.
	GoogleVideoEngine string
	WebEngine         string
	Client            *http.Client
	// collectors are consulted in order on every search pass.
	collectors []searchCollector
	// Debug, when non-nil, receives structured debug events.
	Debug func(message string, data any)
	// cacheMu guards searchCache, fetchCache, and artgridAPIBlockedUntil.
	cacheMu     sync.Mutex
	searchCache map[string]cachedSearchResults
	fetchCache  map[string]cachedFetchResult
	// artgridAPIBlockedUntil backs the Artgrid details API off after a 403.
	artgridAPIBlockedUntil time.Time
}
// cachedSearchResults is a TTL-bound snapshot of SearxNG results for one
// (baseURL, query, categories, engine, source) tuple.
type cachedSearchResults struct {
	items     []SearchResult
	expiresAt time.Time
}
// cachedFetchResult is a TTL-bound cached page/API body keyed by
// "<type>\n<url>" (see fetchText / fetchJSONText).
type cachedFetchResult struct {
	body      string
	expiresAt time.Time
}
// SearchExecutionMeta reports execution facts alongside results.
// PartialDueToDeadline is true when any stage was cut short by the caller's
// deadline, so the result set may be incomplete.
type SearchExecutionMeta struct {
	PartialDueToDeadline bool `json:"partialDueToDeadline"`
}
// NewSearchService builds a SearchService for the given SearxNG base URL.
// Empty engine names fall back to "google videos" (video) and "google" (web).
func NewSearchService(baseURL, googleVideoEngine, webEngine string) *SearchService {
	videoEngine := googleVideoEngine
	if videoEngine == "" {
		videoEngine = "google videos"
	}
	textEngine := webEngine
	if textEngine == "" {
		textEngine = "google"
	}
	svc := &SearchService{
		BaseURL:           strings.TrimRight(baseURL, "/"),
		GoogleVideoEngine: videoEngine,
		WebEngine:         textEngine,
		Client:            &http.Client{Timeout: 20 * time.Second},
		searchCache:       map[string]cachedSearchResults{},
		fetchCache:        map[string]cachedFetchResult{},
	}
	svc.collectors = []searchCollector{
		envatoCollector{},
		artgridCollector{},
		googleVideoCollector{},
	}
	return svc
}
// SearchMedia runs the full multi-collector search with no time budget.
// It is a convenience wrapper around SearchMediaWithDeadline with a zero
// deadline.
func (s *SearchService) SearchMedia(queries []string, enabledPlatforms map[string]bool) ([]SearchResult, SearchExecutionMeta, error) {
	return s.SearchMediaWithDeadline(queries, enabledPlatforms, time.Time{})
}
// SearchMediaWithDeadline fans the queries out across all enabled
// collectors, deduplicates by link, caps per-source counts, sorts by source
// weight, then enriches the results. A zero deadline disables the time
// budget; otherwise work stops early and meta.PartialDueToDeadline is set.
// The error from the last failing collector is surfaced only if no results
// at all were gathered.
func (s *SearchService) SearchMediaWithDeadline(queries []string, enabledPlatforms map[string]bool, deadline time.Time) ([]SearchResult, SearchExecutionMeta, error) {
	meta := SearchExecutionMeta{}
	if s.BaseURL == "" {
		return nil, meta, fmt.Errorf("searxng base url is not configured")
	}
	s.debug("search_service:start", map[string]any{
		"queries":          queries,
		"enabledPlatforms": enabledPlatforms,
	})
	seen := map[string]bool{}        // links already accepted (dedupe across collectors)
	sourceCounts := map[string]int{} // accepted results per collector name
	results := make([]SearchResult, 0, 90)
	var lastErr error
	// Shuffle and cap the incoming queries; the first few become the primary
	// pass, the remainder are used only to fill in collectors left empty.
	baseQueries := limitQueries(queries, 8)
	shuffleStrings(baseQueries)
	primaryQueries := baseQueries[:minInt(len(baseQueries), 3)]
	// runSearchPass mutates results/seen/sourceCounts/lastErr/meta via
	// closure; the deadline is re-checked at every nesting level so one slow
	// collector cannot blow the whole budget.
	runSearchPass := func(bases []string, onlyMissing bool) {
		for _, base := range bases {
			if !deadline.IsZero() && time.Now().After(deadline) {
				meta.PartialDueToDeadline = true
				s.debug("search_service:deadline_reached", map[string]any{"stage": "runSearchPass", "base": base})
				return
			}
			base = strings.TrimSpace(base)
			if base == "" {
				continue
			}
			for _, collector := range s.collectors {
				if !deadline.IsZero() && time.Now().After(deadline) {
					meta.PartialDueToDeadline = true
					s.debug("search_service:deadline_reached", map[string]any{"stage": "collectorLoop", "collector": collector.Name()})
					return
				}
				if !collector.Enabled(enabledPlatforms) {
					continue
				}
				if sourceCounts[collector.Name()] >= collector.MaxResults() {
					continue
				}
				// Fill-in pass: only bother with collectors still empty.
				if onlyMissing && sourceCounts[collector.Name()] > 0 {
					continue
				}
				searchQueries := collector.BuildQueries(base)
				shuffleStrings(searchQueries)
				searchQueries = limitCollectorQueries(collector.Name(), searchQueries, onlyMissing)
				s.debug("search_service:collector_queries", map[string]any{
					"collector":     collector.Name(),
					"base":          base,
					"onlyMissing":   onlyMissing,
					"searchQueries": searchQueries,
				})
				for _, searchQuery := range searchQueries {
					if !deadline.IsZero() && time.Now().After(deadline) {
						meta.PartialDueToDeadline = true
						s.debug("search_service:deadline_reached", map[string]any{"stage": "queryLoop", "collector": collector.Name(), "query": searchQuery})
						return
					}
					if sourceCounts[collector.Name()] >= collector.MaxResults() {
						break
					}
					items, err := collector.Collect(s, searchQuery)
					if err != nil {
						// Remember the error but keep going; it is only
						// surfaced when nothing at all was collected.
						s.debug("search_service:collector_error", map[string]any{
							"collector": collector.Name(),
							"query":     searchQuery,
							"error":     err.Error(),
						})
						lastErr = err
						continue
					}
					s.debug("search_service:collector_results", map[string]any{
						"collector":   collector.Name(),
						"query":       searchQuery,
						"rawCount":    len(items),
						"sourceCount": sourceCounts[collector.Name()],
					})
					for _, item := range items {
						item = normalizeResultForCollector(collector.Name(), item)
						if item.Link == "" || seen[item.Link] || !collector.Accept(item) {
							continue
						}
						seen[item.Link] = true
						results = append(results, item)
						sourceCounts[collector.Name()]++
						if sourceCounts[collector.Name()] >= collector.MaxResults() {
							break
						}
					}
				}
			}
		}
	}
	runSearchPass(primaryQueries, false)
	if len(baseQueries) > len(primaryQueries) {
		// Second pass with the leftover queries, targeting only collectors
		// that produced nothing in the primary pass.
		runSearchPass(baseQueries[len(primaryQueries):], true)
	}
	if len(results) == 0 && lastErr != nil {
		return nil, meta, lastErr
	}
	// Stable sort: rank sources by weight while preserving intra-source order.
	sort.SliceStable(results, func(i, j int) bool {
		return sourceWeight(results[i].Source) > sourceWeight(results[j].Source)
	})
	s.debug("search_service:complete", map[string]any{
		"resultCount":          len(results),
		"sourceCounts":         sourceCounts,
		"hadError":             lastErr != nil,
		"partialDueToDeadline": meta.PartialDueToDeadline,
	})
	enriched, enrichMeta := s.EnrichResultsWithDeadline(results, deadline)
	meta.PartialDueToDeadline = meta.PartialDueToDeadline || enrichMeta.PartialDueToDeadline
	return enriched, meta, nil
}
// EnrichResults enriches results with no time budget; convenience wrapper
// around EnrichResultsWithDeadline with a zero deadline.
func (s *SearchService) EnrichResults(results []SearchResult) []SearchResult {
	enriched, _ := s.EnrichResultsWithDeadline(results, time.Time{})
	return enriched
}
// EnrichResultsWithDeadline enriches up to the first 18 results concurrently
// with at most 4 in-flight enrichments; later results pass through
// untouched. Goroutines that start after the deadline bail out immediately
// and mark the run partial (zero deadline = no budget).
func (s *SearchService) EnrichResultsWithDeadline(results []SearchResult, deadline time.Time) ([]SearchResult, SearchExecutionMeta) {
	meta := SearchExecutionMeta{}
	limit := minInt(len(results), 18)
	if limit == 0 {
		return results, meta
	}
	s.debug("search_service:enrich_start", map[string]any{
		"total": len(results),
		"limit": limit,
	})
	// Work on a copy so the caller's slice is never mutated concurrently.
	enriched := make([]SearchResult, len(results))
	copy(enriched, results)
	var wg sync.WaitGroup
	var metaMu sync.Mutex            // guards meta across worker goroutines
	sem := make(chan struct{}, 4)    // bounds concurrent enrichment fetches
	for idx := 0; idx < limit; idx++ {
		wg.Add(1)
		go func(i int) {
			defer wg.Done()
			// Check the deadline before taking a semaphore slot so queued
			// work is dropped, not delayed.
			if !deadline.IsZero() && time.Now().After(deadline) {
				metaMu.Lock()
				meta.PartialDueToDeadline = true
				metaMu.Unlock()
				return
			}
			sem <- struct{}{}
			defer func() { <-sem }()
			s.debug("search_service:enrich_item_start", map[string]any{
				"index":  i,
				"link":   enriched[i].Link,
				"source": enriched[i].Source,
			})
			// Each goroutine owns exactly one index, so enriched[i] needs
			// no additional locking.
			enriched[i] = s.enrichResult(enriched[i])
			s.debug("search_service:enrich_item_done", map[string]any{
				"index":     i,
				"link":      enriched[i].Link,
				"source":    enriched[i].Source,
				"thumbnail": strings.TrimSpace(enriched[i].ThumbnailURL) != "",
				"preview":   strings.TrimSpace(enriched[i].PreviewVideoURL) != "",
				"title":     truncateForDebug(enriched[i].Title, 120),
			})
		}(idx)
	}
	wg.Wait()
	s.debug("search_service:enrich_complete", map[string]any{"limit": limit})
	return enriched, meta
}
// enrichResult dispatches enrichment to the collector that produced the
// result; when no collector claims the source, it falls back to deriving a
// thumbnail from the link (effective for YouTube-style links only).
func (s *SearchService) enrichResult(result SearchResult) SearchResult {
	for _, collector := range s.collectors {
		if collector.Name() != result.Source {
			continue
		}
		return collector.Enrich(s, result)
	}
	if result.ThumbnailURL == "" {
		result.ThumbnailURL = deriveThumbnail(result.Link)
	}
	return result
}
// searchWithFallback runs a SearxNG query pinned to a specific engine and,
// if that fails and an engine was actually pinned, retries once with the
// engine restriction removed.
func (s *SearchService) searchWithFallback(query, categories, engine, source string) ([]SearchResult, error) {
	s.debug("search_service:search_with_fallback", map[string]any{
		"query":      query,
		"categories": categories,
		"engine":     engine,
		"source":     source,
	})
	items, primaryErr := s.search(query, categories, engine, source)
	if primaryErr == nil {
		return items, nil
	}
	s.debug("search_service:search_with_fallback_primary_error", map[string]any{
		"query":  query,
		"engine": engine,
		"error":  primaryErr.Error(),
	})
	// No engine restriction to drop — nothing left to retry with.
	if strings.TrimSpace(engine) == "" {
		return nil, primaryErr
	}
	return s.search(query, categories, "", source)
}
// enrichEnvato scrapes an Envato Elements item page to upgrade the result's
// title, snippet, thumbnail, and preview video, preferring VideoObject
// JSON-LD, then OpenGraph/meta tags.
func (s *SearchService) enrichEnvato(result SearchResult) SearchResult {
	s.debug("search_service:enrich_envato_start", map[string]any{"link": result.Link})
	html, err := s.fetchText(result.Link)
	if err != nil {
		s.debug("search_service:enrich_envato_fetch_error", map[string]any{"link": result.Link, "error": err.Error()})
		return result
	}
	videoMeta := extractVideoObjectJSONLD(html)
	result.Title = firstNonEmpty(
		cleanEnvatoTitle(videoMeta.Name),
		extractMetaContent(html, "og:title"),
		result.Title,
	)
	result.Snippet = firstNonEmpty(
		cleanEnvatoDescription(videoMeta.Description),
		extractMetaContent(html, "og:description"),
		extractMetaContent(html, "description"),
		result.Snippet,
	)
	pageThumbnail := firstNonEmpty(
		videoMeta.ThumbnailURL,
		extractMetaContent(html, "og:image"),
		extractMetaContent(html, "twitter:image"),
		extractJSONLDValue(html, "thumbnailUrl"),
	)
	// Replace the current thumbnail only when the page's own image is usable
	// and the current one is empty, a search-engine proxy, or off-host.
	if hasUsableThumbnail(pageThumbnail) && shouldPreferPageThumbnail(result.ThumbnailURL, result.Link) {
		result.ThumbnailURL = pageThumbnail
	}
	if result.PreviewVideoURL == "" {
		result.PreviewVideoURL = collectEnvatoPreviewURL(html, pageThumbnail, result.ThumbnailURL, videoMeta.ContentURL)
	}
	if result.PreviewVideoURL == "" {
		// One delayed retry for the preview URL.
		// NOTE(review): fetchText caches bodies for 3 minutes, so this retry
		// likely returns the same cached HTML — confirm the retry is effective.
		time.Sleep(1200 * time.Millisecond)
		if retryHTML, retryErr := s.fetchText(result.Link); retryErr == nil {
			result.PreviewVideoURL = collectEnvatoPreviewURL(retryHTML, pageThumbnail, result.ThumbnailURL, "")
		}
	}
	s.debug("search_service:enrich_envato_done", map[string]any{
		"link":      result.Link,
		"thumbnail": strings.TrimSpace(result.ThumbnailURL) != "",
		"preview":   strings.TrimSpace(result.PreviewVideoURL) != "",
	})
	return result
}
// enrichArtgrid fills in title/snippet/thumbnail/preview for an
// Artgrid/Artlist clip. It tries the Artgrid details API first (guarded by
// a 15-minute back-off after a 403), then falls back to scraping the clip
// page itself.
func (s *SearchService) enrichArtgrid(result SearchResult) SearchResult {
	clipID := extractArtgridClipID(result.Link)
	if clipID == "" {
		s.debug("search_service:enrich_artgrid_skip", map[string]any{"link": result.Link, "reason": "missing clip id"})
		return result
	}
	s.debug("search_service:enrich_artgrid_start", map[string]any{"link": result.Link, "clipId": clipID})
	apiURL := "https://artgrid.io/api/clip/details?clipId=" + clipID
	var err error
	if s.shouldSkipArtgridAPI() {
		// A recent 403 put the API on cool-down; go straight to HTML scraping.
		s.debug("search_service:enrich_artgrid_api_skip", map[string]any{
			"link":   result.Link,
			"clipId": clipID,
			"reason": "cached_403_guard",
		})
	} else {
		var body string
		body, err = s.fetchJSONText(apiURL)
		if err == nil {
			// Harvest all URLs from the JSON payload and pick the best
			// image/video candidates for this clip.
			urls := collectURLs(body)
			if !hasUsableThumbnail(result.ThumbnailURL) {
				result.ThumbnailURL = pickArtgridImageURL(urls, clipID)
			}
			if result.PreviewVideoURL == "" {
				result.PreviewVideoURL = pickVideoURL(urls)
			}
		}
		if err != nil {
			// 403 means the API is rejecting us — back off for a while.
			// Matches the "status %d" text produced by fetchJSONText.
			if strings.Contains(err.Error(), "status 403") {
				s.blockArtgridAPI(15 * time.Minute)
			}
			s.debug("search_service:enrich_artgrid_api_error", map[string]any{"link": result.Link, "clipId": clipID, "error": err.Error()})
		}
	}
	if result.ThumbnailURL == "" || result.PreviewVideoURL == "" {
		// HTML fallback. Note: err here shadows the API err above on purpose.
		html, err := s.fetchText(result.Link)
		if err == nil {
			if !isMatchingArtgridClipPage(html, clipID) {
				// The server returned some other page (redirect / shell);
				// do not attach media belonging to the wrong clip.
				s.debug("search_service:enrich_artgrid_html_mismatch", map[string]any{
					"link":    result.Link,
					"clipId":  clipID,
					"signals": artgridHTMLSignals(html, clipID),
				})
				return result
			}
			result.Title = firstNonEmpty(
				cleanArtgridTitle(extractMetaContent(html, "og:title")),
				cleanArtgridTitle(extractHTMLTitle(html)),
				result.Title,
			)
			result.Snippet = firstNonEmpty(
				cleanArtgridDescription(extractMetaContent(html, "og:description")),
				cleanArtgridDescription(extractMetaContent(html, "description")),
				result.Snippet,
			)
			pageThumbnail := firstNonEmpty(
				extractMetaContent(html, "og:image"),
				extractMetaContent(html, "twitter:image"),
				extractArtgridBackgroundThumbnail(html, clipID),
				extractJSONLDValue(html, "image"),
				pickArtgridImageURL(collectURLs(html), clipID),
			)
			if hasUsableThumbnail(pageThumbnail) && shouldPreferPageThumbnail(result.ThumbnailURL, result.Link) {
				result.ThumbnailURL = pageThumbnail
			}
			if result.PreviewVideoURL == "" {
				// Try successively looser extraction strategies.
				result.PreviewVideoURL = firstNonEmpty(
					extractJSONLDValue(html, "contentUrl"),
					extractMetaContent(html, "twitter:player:stream"),
					extractMetaContent(html, "og:video"),
					extractMetaContent(html, "og:video:url"),
					extractMetaContent(html, "og:video:secure_url"),
					extractVideoPreviewURL(html),
					pickVideoURL(collectURLs(html)),
				)
			}
			if result.PreviewVideoURL == "" {
				// One delayed retry.
				// NOTE(review): fetchText caches bodies for 3 minutes, so
				// this retry likely sees the same HTML — confirm intent.
				time.Sleep(1200 * time.Millisecond)
				if retryHTML, retryErr := s.fetchText(result.Link); retryErr == nil {
					result.PreviewVideoURL = firstNonEmpty(
						extractJSONLDValue(retryHTML, "contentUrl"),
						extractMetaContent(retryHTML, "twitter:player:stream"),
						extractVideoPreviewURL(retryHTML),
					)
				}
			}
		}
	}
	s.debug("search_service:enrich_artgrid_done", map[string]any{
		"link":      result.Link,
		"clipId":    clipID,
		"thumbnail": strings.TrimSpace(result.ThumbnailURL) != "",
		"preview":   strings.TrimSpace(result.PreviewVideoURL) != "",
	})
	return result
}
// search issues one SearxNG JSON query (optionally pinned to categories and
// an engine), converts the hits into SearchResults, and caches the outcome
// for 2 minutes. The cache key covers every request parameter.
func (s *SearchService) search(query, categories, engine, source string) ([]SearchResult, error) {
	cacheKey := strings.Join([]string{
		s.BaseURL,
		query,
		categories,
		engine,
		source,
	}, "\n")
	if cached, ok := s.getCachedSearchResults(cacheKey); ok {
		s.debug("search_service:searx_cache_hit", map[string]any{
			"query":      query,
			"categories": categories,
			"engine":     engine,
			"source":     source,
			"count":      len(cached),
		})
		return cached, nil
	}
	values := url.Values{}
	values.Set("q", query)
	values.Set("format", "json")
	values.Set("safesearch", "0")
	values.Set("language", "en-US")
	if categories != "" {
		values.Set("categories", categories)
	}
	if engine != "" {
		values.Set("engines", engine)
	}
	endpoint := s.BaseURL + "/search?" + values.Encode()
	s.debug("search_service:searx_request", map[string]any{
		"endpoint":   endpoint,
		"query":      query,
		"categories": categories,
		"engine":     engine,
		"source":     source,
	})
	resp, err := s.Client.Get(endpoint)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()
	if resp.StatusCode >= 300 {
		return nil, fmt.Errorf("searxng returned status %d for query %q", resp.StatusCode, query)
	}
	// Anonymous struct mirrors just the SearxNG response fields we consume.
	var payload struct {
		Results []struct {
			Title        string `json:"title"`
			URL          string `json:"url"`
			Content      string `json:"content"`
			Thumbnail    string `json:"thumbnail"`
			ThumbnailSrc string `json:"thumbnail_src"`
			ImgSrc       string `json:"img_src"`
			ParsedURL    []any  `json:"parsed_url"`
			Engine       string `json:"engine"`
		} `json:"results"`
	}
	if err := json.NewDecoder(resp.Body).Decode(&payload); err != nil {
		return nil, fmt.Errorf("searxng JSON decode failed for query %q: %w", query, err)
	}
	s.debug("search_service:searx_response", map[string]any{
		"query":    query,
		"source":   source,
		"rawCount": len(payload.Results),
	})
	results := make([]SearchResult, 0, len(payload.Results))
	for _, item := range payload.Results {
		link := strings.TrimSpace(item.URL)
		if link == "" {
			continue
		}
		results = append(results, SearchResult{
			Title:       item.Title,
			Link:        link,
			DisplayLink: inferDisplayLink(link, item.ParsedURL),
			Snippet:     item.Content,
			// First available thumbnail field wins; else derive from the link.
			ThumbnailURL: firstNonEmpty(item.Thumbnail, item.ThumbnailSrc, item.ImgSrc, deriveThumbnail(link)),
			Source:       normalizeSource(source, link, item.Engine),
		})
	}
	s.setCachedSearchResults(cacheKey, results, 2*time.Minute)
	return results, nil
}
// getCachedSearchResults returns a defensive copy of the cached results for
// key and whether a live entry existed; expired entries are evicted on read.
func (s *SearchService) getCachedSearchResults(key string) ([]SearchResult, bool) {
	s.cacheMu.Lock()
	defer s.cacheMu.Unlock()
	entry, found := s.searchCache[key]
	switch {
	case !found:
		return nil, false
	case time.Now().After(entry.expiresAt):
		delete(s.searchCache, key)
		return nil, false
	default:
		return cloneSearchResults(entry.items), true
	}
}
// setCachedSearchResults stores a defensive copy of items under key with the
// given time-to-live.
func (s *SearchService) setCachedSearchResults(key string, items []SearchResult, ttl time.Duration) {
	entry := cachedSearchResults{
		items:     cloneSearchResults(items),
		expiresAt: time.Now().Add(ttl),
	}
	s.cacheMu.Lock()
	s.searchCache[key] = entry
	s.cacheMu.Unlock()
}
// getCachedFetchResult returns the cached body for key and whether a live
// entry existed; expired entries are evicted on read.
func (s *SearchService) getCachedFetchResult(key string) (string, bool) {
	s.cacheMu.Lock()
	defer s.cacheMu.Unlock()
	entry, found := s.fetchCache[key]
	switch {
	case !found:
		return "", false
	case time.Now().After(entry.expiresAt):
		delete(s.fetchCache, key)
		return "", false
	default:
		return entry.body, true
	}
}
// setCachedFetchResult stores a fetched body under key with the given TTL.
func (s *SearchService) setCachedFetchResult(key, body string, ttl time.Duration) {
	entry := cachedFetchResult{
		body:      body,
		expiresAt: time.Now().Add(ttl),
	}
	s.cacheMu.Lock()
	s.fetchCache[key] = entry
	s.cacheMu.Unlock()
}
// shouldSkipArtgridAPI reports whether the Artgrid details API is currently
// on 403 cool-down (see blockArtgridAPI).
func (s *SearchService) shouldSkipArtgridAPI() bool {
	s.cacheMu.Lock()
	blockedUntil := s.artgridAPIBlockedUntil
	s.cacheMu.Unlock()
	if blockedUntil.IsZero() {
		return false
	}
	return time.Now().Before(blockedUntil)
}
// blockArtgridAPI puts the Artgrid details API on cool-down for ttl.
func (s *SearchService) blockArtgridAPI(ttl time.Duration) {
	until := time.Now().Add(ttl)
	s.cacheMu.Lock()
	s.artgridAPIBlockedUntil = until
	s.cacheMu.Unlock()
}
// debug forwards a structured debug event to the configured Debug hook.
// Safe to call on a nil receiver or with no hook installed.
func (s *SearchService) debug(message string, data any) {
	if s == nil || s.Debug == nil {
		return
	}
	s.Debug(message, data)
}
// truncateForDebug trims text and shortens it to at most limit characters
// (runes), appending "..." when truncation occurred. The original sliced by
// bytes, which could split a multi-byte UTF-8 sequence and emit invalid
// UTF-8 into debug logs; truncation is now rune-safe.
func truncateForDebug(text string, limit int) string {
	trimmed := strings.TrimSpace(text)
	if len(trimmed) <= limit {
		return trimmed // fast path: byte length already within limit
	}
	runes := []rune(trimmed)
	if len(runes) <= limit {
		return trimmed
	}
	return string(runes[:limit]) + "..."
}
// buildGoogleVideoQueries expands a base term into quoted Google Video
// queries that bias toward stock footage and exclude tutorial/vlog noise.
func buildGoogleVideoQueries(base string) []string {
	templates := []string{
		`"%s" ("stock footage" OR "b-roll" OR cinematic OR "establishing shot" OR editorial) -tutorial -"how to" -review -reaction -course -podcast -vlog -interview -breakdown -edit -editing`,
		`"%s" ("cinematic b-roll" OR "establishing shot" OR "drone footage" OR "urban footage") -tutorial -reaction -vlog -podcast`,
		`"%s" ("night drive" OR "city footage" OR "street footage" OR "editorial footage") -tutorial -review -music`,
	}
	queries := make([]string, 0, len(templates))
	for _, template := range templates {
		queries = append(queries, fmt.Sprintf(template, base))
	}
	return queries
}
// buildEnvatoQueries expands a base term into site-restricted queries for
// Envato Elements stock footage and motion-graphics pages.
func buildEnvatoQueries(base string) []string {
	templates := []string{
		`%s ("stock footage" OR "stock video" OR "b-roll" OR cinematic) site:elements.envato.com`,
		`%s ("stock footage" OR "stock video" OR "b-roll" OR cinematic) site:elements.envato.com/stock-video`,
		`%s ("motion graphics" OR "backgrounds" OR "establishing shot" OR "loop") site:elements.envato.com`,
		`%s ("urban" OR "night city" OR "cyberpunk" OR "sci-fi") site:elements.envato.com`,
		`"%s" site:elements.envato.com`,
	}
	queries := make([]string, 0, len(templates))
	for _, template := range templates {
		queries = append(queries, fmt.Sprintf(template, base))
	}
	return queries
}
// buildArtgridQueries expands a base term into site-restricted queries for
// Artgrid and Artlist clip pages.
func buildArtgridQueries(base string) []string {
	templates := []string{
		`%s ("stock footage" OR "b-roll" OR cinematic OR editorial) site:artgrid.io/clip/`,
		`%s ("footage" OR "cinematic" OR "establishing shot") site:artgrid.io/clip/`,
		`%s ("stock footage" OR "b-roll" OR cinematic OR editorial) site:artlist.io/stock-footage/clip/`,
		`%s ("footage" OR "cinematic" OR "establishing shot") site:artlist.io/stock-footage/clip/`,
		`%s ("night drive" OR "urban night" OR "wet road" OR "cyberpunk") site:artgrid.io/clip/`,
		`%s ("drone" OR "city skyline" OR "street scene" OR "mood shot") site:artlist.io/stock-footage/clip/`,
		`"%s" site:artgrid.io/clip/`,
	}
	queries := make([]string, 0, len(templates))
	for _, template := range templates {
		queries = append(queries, fmt.Sprintf(template, base))
	}
	return queries
}
// isUsefulGoogleVideoResult keeps only direct YouTube video links whose
// title/snippet do not look like tutorials, music, or other non-footage
// content.
func isUsefulGoogleVideoResult(result SearchResult) bool {
	link := strings.ToLower(result.Link)
	isYouTube := strings.Contains(link, "youtube.com/watch") ||
		strings.Contains(link, "youtu.be/") ||
		strings.Contains(link, "youtube.com/shorts/")
	if !isYouTube {
		return false
	}
	bannedTerms := []string{
		"tutorial", "how to", "review", "reaction", "podcast", "interview", "walkthrough",
		"course", "lesson", "edit tutorial", "editing tutorial", "premiere pro", "after effects",
		"breakdown", "explained", "vlog", "tips", "guide", "learn", "free download",
		"bgm", "music", "song", "lyrics", "audio", "soundtrack", "trailer", "teaser",
		"full movie", "movie clip", "status", "whatsapp status", "fan cam", "fancam",
	}
	haystack := strings.ToLower(result.Title + " " + result.Snippet)
	for _, term := range bannedTerms {
		if strings.Contains(haystack, term) {
			return false
		}
	}
	return true
}
// envatoItemCodeRe matches Envato Elements item slugs, which end with an
// uppercase alphanumeric item code (e.g. "...-ABC123XYZ"). Compiled once at
// package scope instead of on every call (this runs per search result).
var envatoItemCodeRe = regexp.MustCompile(`-[A-Z0-9]{6,}$`)

// isRenderableEnvatoResult reports whether the link points at a concrete
// Envato Elements item page (not the site root or a category listing).
func isRenderableEnvatoResult(result SearchResult) bool {
	parsed, err := url.Parse(result.Link)
	if err != nil {
		return false
	}
	host := strings.ToLower(parsed.Host)
	path := strings.Trim(parsed.Path, "/")
	if !strings.Contains(host, "elements.envato.com") {
		return false
	}
	// Reject the root and category sub-paths; only item slugs qualify.
	if path == "" || strings.Contains(path, "/stock-video") || strings.Contains(path, "/video-templates") {
		return false
	}
	return envatoItemCodeRe.MatchString(path)
}
// Compiled once at package scope instead of per call (these run per search
// result in the hot accept path).
var (
	// artgridClipPathRe matches canonical Artgrid clip paths: /clip/<id>/...
	artgridClipPathRe = regexp.MustCompile(`^/clip/[0-9]+/`)
	// artlistClipPathRe matches Artlist clip paths ending in a numeric id.
	artlistClipPathRe = regexp.MustCompile(`^/stock-footage/clip/.+/[0-9]+$`)
)

// isRenderableArtgridResult reports whether the link is a concrete
// Artgrid or Artlist clip page (as opposed to category or search pages).
func isRenderableArtgridResult(result SearchResult) bool {
	parsed, err := url.Parse(result.Link)
	if err != nil {
		return false
	}
	host := strings.ToLower(parsed.Host)
	switch {
	case strings.Contains(host, "artgrid.io"):
		return artgridClipPathRe.MatchString(parsed.Path)
	case strings.Contains(host, "artlist.io"):
		trimmedPath := strings.TrimSuffix(parsed.Path, "/")
		return artlistClipPathRe.MatchString(trimmedPath)
	default:
		return false
	}
}
// normalizeSource maps a raw SearxNG hit to one of the known source labels.
// An explicit source wins; otherwise the link and engine are inspected.
// The lowercased link/engine are computed once instead of per case arm.
func normalizeSource(source, link, engine string) string {
	lowerLink := strings.ToLower(link)
	switch {
	case source != "":
		return source
	case strings.Contains(lowerLink, "envato") || strings.Contains(lowerLink, "videohive"):
		return "Envato"
	case strings.Contains(lowerLink, "artgrid"), strings.Contains(lowerLink, "artlist.io/stock-footage/clip/"):
		return "Artgrid"
	case strings.Contains(strings.ToLower(engine), "google"):
		return "Google Video"
	default:
		// Unknown provider: surface the raw engine name.
		return engine
	}
}
// inferDisplayLink derives a display host from SearxNG's parsed_url tuple
// (index 1 holds the host) and falls back to parsing the link itself.
// Returns "" when neither yields a host.
func inferDisplayLink(link string, parsed []any) string {
	if len(parsed) > 1 {
		host, ok := parsed[1].(string)
		if ok {
			return host
		}
	}
	parsedURL, err := url.Parse(link)
	if err != nil {
		return ""
	}
	return parsedURL.Host
}
// deriveThumbnail builds a YouTube thumbnail URL when the link is a YouTube
// video; all other links yield "".
func deriveThumbnail(link string) string {
	videoID := extractYouTubeID(link)
	if videoID == "" {
		return ""
	}
	return "https://i.ytimg.com/vi/" + videoID + "/hqdefault.jpg"
}
// isLowValueThumbnail reports whether a thumbnail URL is empty, points at
// site chrome (favicons, logos, placeholders), or is served through a
// search-engine image proxy — except ytimg.com, which is always real artwork.
func isLowValueThumbnail(raw string) bool {
	lower := strings.ToLower(strings.TrimSpace(raw))
	if lower == "" {
		return true
	}
	chromeTokens := []string{
		"favicon", "apple-touch-icon", "/logo", "/icon", "icon.", "logo.", "placehold.co",
	}
	for _, token := range chromeTokens {
		if strings.Contains(lower, token) {
			return true
		}
	}
	proxyHosts := []string{
		"googleusercontent.com", "gstatic.com", "bing.com", "duckduckgo.com", "icons.duckduckgo.com",
	}
	for _, host := range proxyHosts {
		if strings.Contains(lower, host) && !strings.Contains(lower, "ytimg.com") {
			return true
		}
	}
	return false
}
// hasUsableThumbnail reports whether raw is a non-blank thumbnail URL that
// is not classified as low-value by isLowValueThumbnail.
func hasUsableThumbnail(raw string) bool {
	return strings.TrimSpace(raw) != "" && !isLowValueThumbnail(raw)
}
// HasUsableThumbnail is the exported form of hasUsableThumbnail for use by
// other packages.
func HasUsableThumbnail(raw string) bool {
	return hasUsableThumbnail(raw)
}
// IsLowValueThumbnail is the exported form of isLowValueThumbnail for use by
// other packages.
func IsLowValueThumbnail(raw string) bool {
	return isLowValueThumbnail(raw)
}
// buildEmbedURL converts Google Video (YouTube) links into privacy-enhanced
// youtube-nocookie embed URLs; all other links pass through trimmed.
func buildEmbedURL(source, link string) string {
	trimmed := strings.TrimSpace(link)
	if trimmed == "" {
		return ""
	}
	if !strings.EqualFold(strings.TrimSpace(source), "Google Video") {
		return trimmed
	}
	videoID := extractYouTubeID(trimmed)
	if videoID == "" {
		return trimmed
	}
	return "https://www.youtube-nocookie.com/embed/" + videoID + "?autoplay=1&rel=0&playsinline=1&modestbranding=1&enablejsapi=1"
}
// defaultMediaMode decides how a result should be rendered and returns
// (mode, embedURL, blockedReason). Google Video never uses previews or
// embeds (thumbnail or nothing); Envato/Artgrid prefer preview video, then
// thumbnail, then embed, each tagged with a provider-specific reason.
func defaultMediaMode(source, link, previewURL, thumbnailURL string) (string, string, string) {
	embedURL := buildEmbedURL(source, link)
	hasPreview := strings.TrimSpace(previewURL) != ""
	hasThumb := hasUsableThumbnail(thumbnailURL)
	if source == "Google Video" {
		if hasThumb {
			return "thumbnail", embedURL, "webpage_like_preview_preferred"
		}
		return "none", embedURL, "webpage_like_preview_preferred"
	}
	reason := ""
	switch source {
	case "Envato":
		reason = "provider_embed_blocked"
	case "Artgrid":
		reason = "provider_preview_unavailable"
	}
	if hasPreview {
		return "preview_video", embedURL, reason
	}
	if hasThumb {
		return "thumbnail", embedURL, reason
	}
	if embedURL != "" {
		return "embed", embedURL, ""
	}
	return "none", "", reason
}
// DecorateRecommendationMedia fills in embed URL, media mode, and action
// metadata on a recommendation before it is returned to the client.
func DecorateRecommendationMedia(item AIRecommendation) AIRecommendation {
	item.EmbedURL = buildEmbedURL(item.Source, item.Link)
	item.MediaMode, _, item.PreviewBlockedReason = defaultMediaMode(item.Source, item.Link, item.PreviewVideoURL, item.ThumbnailURL)
	// Adjust the mode when the chosen asset turns out to be unusable.
	if item.MediaMode == "embed" && item.EmbedURL == "" {
		item.MediaMode = "none"
	}
	hasPreview := strings.TrimSpace(item.PreviewVideoURL) != ""
	if item.MediaMode == "thumbnail" && hasPreview && !hasUsableThumbnail(item.ThumbnailURL) {
		item.MediaMode = "preview_video"
	}
	// Only Google Video supports direct download; everything else opens the
	// provider page.
	if item.Source == "Google Video" {
		item.ActionType = "download"
		item.ActionLabel = "Direct Download"
		item.SecondaryActionLabel = "Open Source"
	} else {
		item.ActionType = "open_source"
		item.ActionLabel = "Open Source"
	}
	return item
}
// youTubeIDPatterns capture the 11-character video id from watch, shorts,
// embed, and youtu.be style links. Compiled once at package scope instead of
// on every call — this runs for every result during accept and enrichment.
var youTubeIDPatterns = []*regexp.Regexp{
	regexp.MustCompile(`(?:v=|\/shorts\/|\/embed\/)([A-Za-z0-9_-]{11})`),
	regexp.MustCompile(`youtu\.be\/([A-Za-z0-9_-]{11})`),
}

// extractYouTubeID returns the YouTube video id embedded in link, or ""
// when the link does not contain one.
func extractYouTubeID(link string) string {
	for _, pattern := range youTubeIDPatterns {
		if matches := pattern.FindStringSubmatch(link); len(matches) == 2 {
			return matches[1]
		}
	}
	return ""
}
// extractMetaContent returns the (HTML-unescaped) content attribute of the
// first <meta> tag whose property= or name= attribute equals property.
// The regexes are built per call because property is dynamic.
// NOTE(review): these patterns require property/name to appear BEFORE the
// content attribute in the tag — confirm the targeted pages always emit
// attributes in that order.
func extractMetaContent(html, property string) string {
	patterns := []*regexp.Regexp{
		regexp.MustCompile(`(?i)<meta[^>]+property=["']` + regexp.QuoteMeta(property) + `["'][^>]+content=["']([^"']+)`),
		regexp.MustCompile(`(?i)<meta[^>]+name=["']` + regexp.QuoteMeta(property) + `["'][^>]+content=["']([^"']+)`),
	}
	for _, pattern := range patterns {
		matches := pattern.FindStringSubmatch(html)
		if len(matches) == 2 {
			return htmlUnescape(matches[1])
		}
	}
	return ""
}
// htmlTitleRe matches the first <title> element, across newlines and case-
// insensitively. Compiled once at package scope instead of on every call.
var htmlTitleRe = regexp.MustCompile(`(?is)<title[^>]*>(.*?)</title>`)

// extractHTMLTitle returns the unescaped, trimmed contents of the first
// <title> element in html, or "" when none is present.
func extractHTMLTitle(html string) string {
	matches := htmlTitleRe.FindStringSubmatch(html)
	if len(matches) == 2 {
		return htmlUnescape(strings.TrimSpace(matches[1]))
	}
	return ""
}
// previewMediaURLRe matches mp4/m3u8 URLs (with optional query string) up to
// the first quote, whitespace, or '>'. Compiled once at package scope
// instead of on every call.
var previewMediaURLRe = regexp.MustCompile(`https?://[^"'[:space:]>]+(?:mp4|m3u8)(?:\?[^"'[:space:]>]*)?`)

// extractVideoPreviewURL scans html (including JSON/JS payloads with escaped
// slashes) for a video URL, preferring ones that look like watermarked
// preview assets and otherwise returning the first media URL found.
func extractVideoPreviewURL(html string) string {
	// Undo the common slash-escaping styles so URLs embedded in scripts match.
	normalizedHTML := strings.ReplaceAll(html, `\\\/`, `/`)
	normalizedHTML = strings.ReplaceAll(normalizedHTML, `\/`, `/`)
	normalizedHTML = strings.ReplaceAll(normalizedHTML, `\u002F`, `/`)
	matches := previewMediaURLRe.FindAllString(normalizedHTML, -1)
	// First pass: prefer URLs that look like preview/watermark assets.
	for _, match := range matches {
		candidate := strings.TrimSpace(strings.ReplaceAll(match, `\\`, ""))
		lower := strings.ToLower(candidate)
		if strings.Contains(lower, "preview") || strings.Contains(lower, "video") || strings.Contains(lower, "watermark") {
			return candidate
		}
	}
	// Fallback: first non-empty media URL of any kind.
	for _, match := range matches {
		candidate := strings.TrimSpace(strings.ReplaceAll(match, `\\`, ""))
		if candidate != "" {
			return candidate
		}
	}
	return ""
}
// artgridThumbnailRe matches image URLs on Artgrid/Artlist CDN hosts.
// BUG FIX: the original pattern was a raw string containing `\\s`, which in
// a raw string reaches the regexp engine as an escaped backslash plus a
// literal 's' — so the character class excluded the letter 's' and almost no
// real URL could match. `\s` (whitespace) is the intended class. The leading
// quantifier is also `*` instead of `+` so URLs whose CDN host immediately
// follows "https://" can match. Compiled once at package scope.
var artgridThumbnailRe = regexp.MustCompile(`https://[^"'\s>]*(?:artgrid\.imgix\.net|cms-public-artifacts\.artlist\.io|artlist-content-images\.imgix\.net)[^"'\s>]+(?:jpeg|jpg|png|webp)`)

// extractArtgridBackgroundThumbnail finds a CDN-hosted image URL in html
// that belongs to the given clip (by id or "graded-thumbnail" marker).
func extractArtgridBackgroundThumbnail(html, clipID string) string {
	matches := artgridThumbnailRe.FindAllString(html, -1)
	for _, match := range matches {
		if strings.Contains(match, clipID) || strings.Contains(strings.ToLower(match), "graded-thumbnail") {
			return match
		}
	}
	return ""
}
// pickArtgridImageURL prefers image URLs tied to the given clip (by clip id,
// "graded-thumbnail" marker, or an imgix host); otherwise it falls back to
// the first generic image URL.
func pickArtgridImageURL(urls []string, clipID string) string {
	imageExts := []string{".jpg", ".jpeg", ".png", ".webp"}
	for _, candidate := range urls {
		lower := strings.ToLower(candidate)
		isImage := false
		for _, ext := range imageExts {
			if strings.Contains(lower, ext) {
				isImage = true
				break
			}
		}
		if !isImage {
			continue
		}
		if strings.Contains(candidate, clipID) || strings.Contains(lower, "graded-thumbnail") || strings.Contains(lower, "imgix") {
			return candidate
		}
	}
	return pickImageURL(urls)
}
// artgridClipIDPatterns capture the numeric clip id from Artgrid
// ("/clip/<id>/...") and Artlist ("/stock-footage/clip/<slug>/<id>") links.
// Compiled once at package scope instead of per call. The original's second
// pattern (`[^/]+` before the id) was strictly subsumed by the `.+` pattern
// with an identical capture group, so it has been dropped.
var artgridClipIDPatterns = []*regexp.Regexp{
	regexp.MustCompile(`/clip/([0-9]+)/`),
	regexp.MustCompile(`/stock-footage/clip/.+/([0-9]+)$`),
}

// extractArtgridClipID returns the numeric clip id in link, or "" when the
// link is not a recognized Artgrid/Artlist clip URL.
func extractArtgridClipID(link string) string {
	trimmed := strings.TrimSuffix(link, "/")
	for _, pattern := range artgridClipIDPatterns {
		if matches := pattern.FindStringSubmatch(trimmed); len(matches) == 2 {
			return matches[1]
		}
	}
	return ""
}
// canonicalizeArtgridLink rewrites Artlist-style clip links into the
// canonical https://artgrid.io/clip/<id>/<slug> form. Links that are empty,
// lack a clip id, are already canonical, or cannot be parsed pass through
// unchanged.
func canonicalizeArtgridLink(link string) string {
	trimmed := strings.TrimSpace(link)
	if trimmed == "" {
		return ""
	}
	clipID := extractArtgridClipID(trimmed)
	switch {
	case clipID == "":
		return trimmed
	case strings.Contains(strings.ToLower(trimmed), "artgrid.io/clip/"):
		return trimmed
	}
	parsed, err := url.Parse(trimmed)
	if err != nil {
		return trimmed
	}
	// Use the path segment immediately before the clip id as the slug;
	// fall back to the id itself when no such segment exists.
	slug := clipID
	segments := strings.Split(strings.Trim(parsed.Path, "/"), "/")
	for idx := 1; idx < len(segments); idx++ {
		if segments[idx] == clipID {
			slug = segments[idx-1]
			break
		}
	}
	return "https://artgrid.io/clip/" + clipID + "/" + slug
}
// normalizeResultForCollector stamps the collector name onto the result and
// applies per-source link canonicalization (Artgrid links are rewritten to
// their canonical form). Unknown sources pass through untouched.
func normalizeResultForCollector(source string, result SearchResult) SearchResult {
	if source == "Artgrid" {
		result.Link = canonicalizeArtgridLink(result.Link)
	}
	switch source {
	case "Artgrid", "Envato", "Google Video":
		result.Source = source
	}
	return result
}
// collectURLsPattern matches http(s) URLs up to the first quote, backslash,
// or whitespace. Compiled once at package scope instead of on every call —
// collectURLs runs over whole page bodies during enrichment.
var collectURLsPattern = regexp.MustCompile(`https?:\/\/[^"'\\\s]+`)

// collectURLs extracts every distinct http(s) URL from body, preserving
// first-seen order.
func collectURLs(body string) []string {
	matches := collectURLsPattern.FindAllString(body, -1)
	seen := map[string]bool{}
	results := make([]string, 0, len(matches))
	for _, match := range matches {
		candidate := strings.TrimSpace(strings.Trim(match, `"'`))
		if candidate == "" || seen[candidate] {
			continue
		}
		seen[candidate] = true
		results = append(results, candidate)
	}
	return results
}
// pickImageURL returns the first URL whose lowercase form contains a common
// raster image extension, or "" when none qualifies.
func pickImageURL(urls []string) string {
	imageExts := []string{".jpg", ".jpeg", ".png", ".webp"}
	for _, candidate := range urls {
		lower := strings.ToLower(candidate)
		for _, ext := range imageExts {
			if strings.Contains(lower, ext) {
				return candidate
			}
		}
	}
	return ""
}
// pickVideoURL prefers HLS playlists hosted on Artgrid/Artlist/CDN domains,
// then falls back to the first mp4 or m3u8 URL of any kind.
func pickVideoURL(urls []string) string {
	for _, candidate := range urls {
		lower := strings.ToLower(candidate)
		if !strings.Contains(lower, ".m3u8") {
			continue
		}
		if strings.Contains(lower, "artgrid") || strings.Contains(lower, "artlist") || strings.Contains(lower, "cdn") {
			return candidate
		}
	}
	for _, candidate := range urls {
		lower := strings.ToLower(candidate)
		if strings.Contains(lower, ".mp4") || strings.Contains(lower, ".m3u8") {
			return candidate
		}
	}
	return ""
}
// fetchText fetches target as HTML, trying two browser-header strategies
// before falling back to a Python-based fetcher (presumably to get past
// bot checks — see fetchTextViaPython). Bodies are capped at 1 MiB and
// cached for 3 minutes under "html\n<target>".
func (s *SearchService) fetchText(target string) (string, error) {
	cacheKey := "html\n" + target
	if cached, ok := s.getCachedFetchResult(cacheKey); ok {
		s.debug("search_service:fetch_cache_hit", map[string]any{"type": "html", "target": target, "bytes": len(cached)})
		return cached, nil
	}
	for _, strategy := range []string{"default", "provider"} {
		req, err := newBrowserRequest(http.MethodGet, target, "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", strategy)
		if err != nil {
			return "", err
		}
		s.debug("search_service:envato_fetch_strategy", map[string]any{"target": target, "strategy": strategy})
		resp, err := s.Client.Do(req)
		if err != nil {
			continue // transport error: try the next header strategy
		}
		// Read (capped at 1 MiB) and close before inspecting the status.
		data, readErr := io.ReadAll(io.LimitReader(resp.Body, 1024*1024))
		_ = resp.Body.Close()
		if readErr != nil {
			continue
		}
		// 403/503 typically indicate bot detection; retry with the other
		// strategy rather than failing outright.
		if resp.StatusCode == http.StatusForbidden || resp.StatusCode == http.StatusServiceUnavailable {
			continue
		}
		if resp.StatusCode >= 300 {
			continue
		}
		// A successful status can still deliver a Cloudflare challenge page.
		if looksLikeCloudflareChallenge(string(data)) {
			continue
		}
		body := string(data)
		s.setCachedFetchResult(cacheKey, body, 3*time.Minute)
		return body, nil
	}
	// Last resort: fetch through the external Python helper.
	body, err := fetchTextViaPython(target)
	if err != nil {
		return "", err
	}
	s.setCachedFetchResult(cacheKey, body, 3*time.Minute)
	return body, nil
}
// fetchJSONText GETs target with the "provider" browser-header strategy,
// expecting a JSON body; responses are capped at 1 MiB and cached for
// 3 minutes under "json\n<target>".
// The "status %d" error text is matched by enrichArtgrid's 403 back-off
// guard — keep it stable.
func (s *SearchService) fetchJSONText(target string) (string, error) {
	cacheKey := "json\n" + target
	if cached, ok := s.getCachedFetchResult(cacheKey); ok {
		s.debug("search_service:fetch_cache_hit", map[string]any{"type": "json", "target": target, "bytes": len(cached)})
		return cached, nil
	}
	req, err := newBrowserRequest(http.MethodGet, target, "application/json, text/json, */*", "provider")
	if err != nil {
		return "", err
	}
	resp, err := s.Client.Do(req)
	if err != nil {
		return "", err
	}
	defer resp.Body.Close()
	if resp.StatusCode >= 300 {
		return "", fmt.Errorf("json fetch returned status %d", resp.StatusCode)
	}
	data, err := io.ReadAll(io.LimitReader(resp.Body, 1024*1024))
	if err != nil {
		return "", err
	}
	body := string(data)
	s.setCachedFetchResult(cacheKey, body, 3*time.Minute)
	return body, nil
}
// firstNonEmpty returns the first argument containing non-whitespace
// characters — returned as-is, NOT trimmed — or "" when all are blank.
func firstNonEmpty(values ...string) string {
	for _, value := range values {
		if strings.TrimSpace(value) == "" {
			continue
		}
		return value
	}
	return ""
}
// shouldPreferPageThumbnail reports whether a thumbnail scraped from the
// result page itself should replace the current thumbnail. Page thumbnails
// win when the current value is blank, points at a known search-engine
// image proxy, has no parseable host, or lives on a different host than
// the page.
func shouldPreferPageThumbnail(current, pageLink string) bool {
	current = strings.TrimSpace(current)
	if current == "" {
		return true
	}
	lowered := strings.ToLower(current)
	for _, proxy := range []string{"imgs.search.brave.com", "googleusercontent.com", "bing.com"} {
		if strings.Contains(lowered, proxy) {
			return true
		}
	}
	thumbHost := hostOf(current)
	if thumbHost == "" {
		return true
	}
	pageHost := hostOf(pageLink)
	return pageHost != "" && thumbHost != pageHost
}
// hostOf returns the lowercased host component of a URL, or "" when the
// value cannot be parsed. Schemeless strings typically parse with an empty
// host and therefore also yield "".
func hostOf(raw string) string {
	if parsed, err := url.Parse(raw); err == nil {
		return strings.ToLower(parsed.Host)
	}
	return ""
}
// extractJSONLDValue pulls the string value for a JSON-LD key out of raw
// HTML with a regular expression, without parsing the document. Escaped
// slashes (\/ and \u002F) are normalized, remaining backslashes stripped,
// and HTML entities decoded. Returns "" when the key is absent or every
// match is blank.
func extractJSONLDValue(html, key string) string {
	re := regexp.MustCompile(`"` + regexp.QuoteMeta(key) + `"\s*:\s*"(https?:\\?/\\?/[^"]+|[^"]+)"`)
	for _, found := range re.FindAllStringSubmatch(html, -1) {
		if len(found) != 2 {
			continue
		}
		candidate := strings.ReplaceAll(found[1], `\/`, `/`)
		candidate = strings.ReplaceAll(candidate, `\u002F`, `/`)
		candidate = strings.ReplaceAll(candidate, `\\`, "")
		candidate = htmlUnescape(candidate)
		if strings.TrimSpace(candidate) != "" {
			return candidate
		}
	}
	return ""
}
// videoObjectMetadata holds the subset of schema.org VideoObject fields
// this service extracts from a page's JSON-LD payload.
type videoObjectMetadata struct {
	Name string // video title (JSON-LD "name")
	Description string // plain-text description
	ThumbnailURL string // poster/thumbnail image URL
	ContentURL string // direct media (preview) URL
}
// extractVideoObjectJSONLD scans every <script type="application/ld+json">
// block in the page and returns metadata from the first payload whose
// @type is VideoObject and that carries at least one non-empty field.
// Payloads that are not plain JSON objects (e.g. @graph arrays) are
// skipped. Returns the zero value when nothing matches.
func extractVideoObjectJSONLD(html string) videoObjectMetadata {
	scriptRE := regexp.MustCompile(`(?is)<script[^>]+type=["']application/ld\+json["'][^>]*>(.*?)</script>`)
	for _, block := range scriptRE.FindAllStringSubmatch(html, -1) {
		if len(block) != 2 {
			continue
		}
		var doc map[string]any
		if json.Unmarshal([]byte(htmlUnescape(strings.TrimSpace(block[1]))), &doc) != nil {
			continue
		}
		kind, _ := doc["@type"].(string)
		if !strings.EqualFold(kind, "VideoObject") {
			continue
		}
		found := videoObjectMetadata{
			Name:         stringValue(doc["name"]),
			Description:  stringValue(doc["description"]),
			ThumbnailURL: stringValue(doc["thumbnailUrl"]),
			ContentURL:   stringValue(doc["contentUrl"]),
		}
		// Only accept a payload that actually contributed something.
		if found != (videoObjectMetadata{}) {
			return found
		}
	}
	return videoObjectMetadata{}
}
// stringValue coerces a decoded JSON value into a cleaned string. Strings
// are trimmed and entity-decoded; arrays yield the first element that
// coerces to a non-empty string (recursively). Any other type yields "".
func stringValue(value any) string {
	if text, ok := value.(string); ok {
		return htmlUnescape(strings.TrimSpace(text))
	}
	if list, ok := value.([]any); ok {
		for _, entry := range list {
			if text := stringValue(entry); text != "" {
				return text
			}
		}
	}
	return ""
}
// cleanEnvatoTitle trims whitespace, decodes HTML entities, and drops the
// trailing " - Envato" site suffix from a scraped page title.
func cleanEnvatoTitle(title string) string {
	cleaned := htmlUnescape(strings.TrimSpace(title))
	return strings.TrimSuffix(cleaned, " - Envato")
}
// cleanEnvatoDescription normalizes a scraped Envato description: trims
// whitespace, decodes HTML entities, and then collapses a second level of
// ampersand escaping ("&amp;amp;" survives the first decode as "&amp;").
func cleanEnvatoDescription(description string) string {
	cleaned := htmlUnescape(strings.TrimSpace(description))
	return strings.ReplaceAll(cleaned, "&amp;", "&")
}
// cleanArtgridTitle strips the known Artgrid site suffixes and a trailing
// " by <author>" credit from a scraped page title.
func cleanArtgridTitle(title string) string {
	cleaned := htmlUnescape(strings.TrimSpace(title))
	for _, suffix := range []string{
		" | Stock Video Footage * Artgrid.io*",
		" | Stock Video Footage - Artgrid.io",
		" | Royalty Free Stock Footage Artgrid.io",
		" | Royalty Free Stock Footage - Artgrid.io",
	} {
		cleaned = strings.TrimSuffix(cleaned, suffix)
	}
	// Drop an author credit such as "Sunset Drive by Jane Doe".
	if at := strings.Index(cleaned, " by "); at > 0 {
		cleaned = cleaned[:at]
	}
	return strings.TrimSpace(cleaned)
}
// cleanArtgridDescription removes Artgrid marketing boilerplate from a
// scraped description: the trailing download pitch, an optional
// "<title> | " prefix, and the site suffixes.
func cleanArtgridDescription(description string) string {
	cleaned := htmlUnescape(strings.TrimSpace(description))
	for _, suffix := range []string{
		" Download this royalty free video and other Stunning Stock HD Videos from Artgrid.",
		" Download this royalty free video and other Stunning Stock HD Videos from Artgrid",
	} {
		cleaned = strings.TrimSuffix(cleaned, suffix)
	}
	// Keep only the portion after the first " | " separator, if any.
	if _, after, found := strings.Cut(cleaned, " | "); found {
		cleaned = after
	}
	cleaned = strings.TrimSuffix(cleaned, " | Royalty Free Stock Footage Artgrid.io")
	cleaned = strings.TrimSuffix(cleaned, " | Royalty Free Stock Footage - Artgrid.io")
	return strings.TrimSpace(cleaned)
}
func artgridHTMLSignals(html, clipID string) map[string]bool {
ogURL := extractMetaContent(html, "og:url")
canonical := extractCanonicalURL(html)
alWebURL := extractMetaContent(html, "al:web:url")
lowerHTML := strings.ToLower(html)
title := strings.ToLower(extractHTMLTitle(html))
ogImage := strings.ToLower(extractMetaContent(html, "og:image"))
twitterImage := strings.ToLower(extractMetaContent(html, "twitter:image"))
return map[string]bool{
"og_url_clip": strings.Contains(ogURL, clipID),
"canonical_clip": strings.Contains(canonical, clipID),
"al_web_clip": strings.Contains(alWebURL, clipID),
"body_main_clipvideo": strings.Contains(lowerHTML, "main-clipvideo_"+clipID),
"body_clip_path": strings.Contains(lowerHTML, "/clip/"+clipID+"/"),
"body_clip_id": strings.Contains(lowerHTML, clipID),
"title_mentions_clip": strings.Contains(title, "artgrid") || strings.Contains(title, "artlist"),
"image_clip": strings.Contains(ogImage, strings.ToLower(clipID)) || strings.Contains(twitterImage, strings.ToLower(clipID)),
}
}
// isMatchingArtgridClipPage decides whether fetched HTML is the Artgrid
// page for clipID. Any single strong signal (URL/canonical/body-path/image
// match) is sufficient on its own; the weak "clip ID appears somewhere in
// the body" signal only counts when the title also looks like an
// Artgrid/Artlist page.
func isMatchingArtgridClipPage(html, clipID string) bool {
	if clipID == "" {
		return false
	}
	signals := artgridHTMLSignals(html, clipID)
	for _, strong := range []string{"og_url_clip", "canonical_clip", "al_web_clip", "body_main_clipvideo", "body_clip_path", "image_clip"} {
		if signals[strong] {
			return true
		}
	}
	return signals["body_clip_id"] && signals["title_mentions_clip"]
}
// extractCanonicalURL returns the href of the page's
// <link rel="canonical"> tag, entity-decoded, or "" when absent. The
// pattern requires rel to appear before href within the tag.
func extractCanonicalURL(html string) string {
	re := regexp.MustCompile(`(?i)<link[^>]+rel=["']canonical["'][^>]+href=["']([^"']+)`)
	if found := re.FindStringSubmatch(html); len(found) == 2 {
		return htmlUnescape(found[1])
	}
	return ""
}
// envatoThumbToPreviewRE matches the image filename segment of an Envato
// "/video_preview/" thumbnail URL. Compiled once at package scope instead
// of on every call (the original recompiled it per invocation).
var envatoThumbToPreviewRE = regexp.MustCompile(`/video_preview/[^/]+\.(?:jpg|jpeg|png|webp)$`)

// deriveEnvatoPreviewFromThumbnail guesses the watermarked MP4 preview URL
// that corresponds to an Envato video thumbnail by rewriting the
// "/video_preview/<image>" path segment. It returns "" when the thumbnail
// is blank or does not contain a "/video_preview/" segment.
func deriveEnvatoPreviewFromThumbnail(thumbnail string) string {
	candidate := htmlUnescape(strings.TrimSpace(thumbnail))
	if candidate == "" {
		return ""
	}
	// Undo double-escaped ampersands left over from scraped HTML.
	candidate = strings.ReplaceAll(candidate, "&amp;", "&")
	if !strings.Contains(candidate, "/video_preview/") {
		return ""
	}
	// Drop any query string before rewriting the path.
	if idx := strings.Index(candidate, "?"); idx >= 0 {
		candidate = candidate[:idx]
	}
	return envatoThumbToPreviewRE.ReplaceAllString(candidate, `/watermarked_preview/watermarked_preview.mp4`)
}
// extractEnvatoPreviewFromHydration decodes the base64-encoded
// window.INITIAL_HYDRATION_DATA blob embedded in an Envato page and scans
// it for a preview video URL. Returns "" when the blob is missing or not
// valid base64.
func extractEnvatoPreviewFromHydration(html string) string {
	encoded := extractWindowAssignedValue(html, "INITIAL_HYDRATION_DATA")
	if encoded == "" {
		return ""
	}
	raw, err := base64.StdEncoding.DecodeString(encoded)
	if err != nil {
		return ""
	}
	payload := string(raw)
	return firstNonEmpty(pickBestEnvatoPreviewURL(collectURLs(payload)), extractVideoPreviewURL(payload))
}
// collectEnvatoPreviewURL gathers every known source of an Envato preview
// video URL for a page, in priority order, and returns the first non-empty
// candidate. Sources range from explicit metadata (a pre-extracted
// contentUrl, JSON-LD, twitter/og video tags, the hydration blob) down to
// heuristics derived from thumbnail URLs.
func collectEnvatoPreviewURL(html, pageThumbnail, currentThumbnail, contentURL string) string {
	pageURLs := collectURLs(html)
	candidates := []string{
		contentURL,
		extractJSONLDValue(html, "contentUrl"),
		extractMetaContent(html, "twitter:player:stream"),
		extractMetaContent(html, "og:video"),
		extractMetaContent(html, "og:video:url"),
		extractMetaContent(html, "og:video:secure_url"),
		extractEnvatoPreviewFromHydration(html),
		pickBestEnvatoPreviewURL(pageURLs),
		extractVideoPreviewURL(html),
		deriveEnvatoPreviewFromThumbnail(pageThumbnail),
		deriveEnvatoPreviewFromThumbnail(currentThumbnail),
	}
	return firstNonEmpty(candidates...)
}
// extractWindowAssignedValue returns the double-quoted string literal
// assigned to window.<variable> in inline page JavaScript, or "" when no
// such assignment exists. Escaped quotes inside the literal are not
// handled; the match stops at the first '"'.
func extractWindowAssignedValue(html, variable string) string {
	re := regexp.MustCompile(`window\.` + regexp.QuoteMeta(variable) + `\s*=\s*"([^"]+)"`)
	if found := re.FindStringSubmatch(html); len(found) == 2 {
		return found[1]
	}
	return ""
}
// pickBestEnvatoPreviewURL selects the most promising Envato preview MP4
// from a list of URLs. Candidates are ranked from most to least specific:
//  1. watermarked previews on the dedicated video-previews CDN host,
//  2. watermarked previews anywhere on envatousercontent.com,
//  3. any .mp4 on envatousercontent.com.
// Bug fix: the original checked the generic tier first; because its
// condition is a superset of the specific tiers, those tiers were
// unreachable dead code. Tiers are now applied most-specific first.
func pickBestEnvatoPreviewURL(urls []string) string {
	tiers := []func(string) bool{
		func(u string) bool {
			return strings.Contains(u, "video-previews.elements.envatousercontent.com") &&
				strings.Contains(u, "watermarked_preview") && strings.HasSuffix(u, ".mp4")
		},
		func(u string) bool {
			return strings.Contains(u, "envatousercontent.com") &&
				strings.Contains(u, "watermarked_preview") && strings.HasSuffix(u, ".mp4")
		},
		func(u string) bool {
			return strings.Contains(u, "envatousercontent.com") && strings.HasSuffix(u, ".mp4")
		},
	}
	for _, matches := range tiers {
		for _, item := range urls {
			if matches(strings.ToLower(item)) {
				return item
			}
		}
	}
	return ""
}
// inferFetchReferer picks a Referer header value appropriate for the
// target host: Envato properties get the Envato Elements homepage,
// Artgrid/Artlist properties get the Artgrid homepage, and everything else
// gets "" (no referer). Envato markers are checked first, matching the
// original switch order.
func inferFetchReferer(target string) string {
	lowered := strings.ToLower(target)
	rules := []struct {
		markers []string
		referer string
	}{
		{[]string{"envatousercontent.com", "elements.envato.com"}, "https://elements.envato.com/"},
		{[]string{"artgrid", "artlist"}, "https://artgrid.io/"},
	}
	for _, rule := range rules {
		for _, marker := range rule.markers {
			if strings.Contains(lowered, marker) {
				return rule.referer
			}
		}
	}
	return ""
}
// newBrowserRequest builds an HTTP request that mimics a desktop Chrome
// browser. The "provider" strategy additionally adds navigation-style
// Sec-Fetch headers and a provider-specific Referer so stock-media sites
// serve the real page rather than a bot challenge.
//
// Bug fix: the original unconditionally set the Referer header even when
// inferFetchReferer returned "", which made Go's transport emit an empty
// "Referer:" header; the header is now only set when a value exists.
func newBrowserRequest(method, target, accept, strategy string) (*http.Request, error) {
	req, err := http.NewRequest(method, target, nil)
	if err != nil {
		return nil, err
	}
	req.Header.Set("User-Agent", "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36")
	req.Header.Set("Accept-Language", "en-US,en;q=0.9")
	if accept != "" {
		req.Header.Set("Accept", accept)
	}
	if strategy == "provider" {
		if referer := inferFetchReferer(target); referer != "" {
			req.Header.Set("Referer", referer)
		}
		req.Header.Set("Upgrade-Insecure-Requests", "1")
		req.Header.Set("Sec-Fetch-Dest", "document")
		req.Header.Set("Sec-Fetch-Mode", "navigate")
		req.Header.Set("Sec-Fetch-Site", "none")
		req.Header.Set("Sec-Fetch-User", "?1")
	}
	return req, nil
}
// fetchTextViaPython is a last-resort fetch path: it shells out to
// python3/urllib with browser-like headers, which sometimes succeeds when
// the in-process HTTP client is blocked (e.g. by TLS fingerprinting).
// Output is capped at 1 MiB. On failure the error carries the truncated
// combined stdout/stderr of the subprocess for debugging.
// NOTE(review): requires python3 on PATH — confirm deployment images
// include it.
func fetchTextViaPython(target string) (string, error) {
	// The script mirrors the headers set by newBrowserRequest's
	// "provider" strategy; argv[1] is the URL, argv[2] the referer.
	script := `
from urllib.request import Request, urlopen
import sys
req = Request(sys.argv[1], headers={
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.9",
    "Referer": sys.argv[2],
    "Upgrade-Insecure-Requests": "1",
    "Sec-Fetch-Dest": "document",
    "Sec-Fetch-Mode": "navigate",
    "Sec-Fetch-Site": "none",
    "Sec-Fetch-User": "?1",
})
with urlopen(req, timeout=20) as resp:
    sys.stdout.buffer.write(resp.read(1024 * 1024))
`
	output, err := exec.Command("python3", "-c", script, target, inferFetchReferer(target)).CombinedOutput()
	if err != nil {
		return "", fmt.Errorf("python fallback failed: %v: %s", err, truncateBytes(output, 300))
	}
	return string(output), nil
}
// looksLikeCloudflareChallenge reports whether a response body appears to
// be a Cloudflare interstitial/challenge page rather than real content.
func looksLikeCloudflareChallenge(body string) bool {
	lowered := strings.ToLower(body)
	for _, marker := range []string{"cf-mitigated", "attention required", "just a moment"} {
		if strings.Contains(lowered, marker) {
			return true
		}
	}
	return false
}
// truncateBytes converts raw subprocess/HTTP output into a trimmed string
// shortened to at most limit bytes plus an ellipsis, for use in log and
// error messages.
//
// Fixes over the original: the cut point is moved back to a UTF-8 rune
// boundary so multi-byte characters are never split mid-sequence, and a
// negative limit no longer panics on the slice expression.
func truncateBytes(data []byte, limit int) string {
	trimmed := strings.TrimSpace(string(data))
	if limit < 0 {
		limit = 0
	}
	if len(trimmed) <= limit {
		return trimmed
	}
	cut := limit
	// 0b10xxxxxx bytes are UTF-8 continuation bytes; back up past them.
	for cut > 0 && trimmed[cut]&0xC0 == 0x80 {
		cut--
	}
	return trimmed[:cut] + "..."
}
// limitQueries trims, de-duplicates (case-insensitively), and caps a query
// list at limit entries, preserving first-seen order and original casing.
// Blank entries are dropped.
func limitQueries(queries []string, limit int) []string {
	capacity := limit
	if len(queries) < capacity {
		capacity = len(queries)
	}
	out := make([]string, 0, capacity)
	seen := make(map[string]bool, capacity)
	for _, raw := range queries {
		query := strings.TrimSpace(raw)
		if query == "" {
			continue
		}
		lowered := strings.ToLower(query)
		if seen[lowered] {
			continue
		}
		seen[lowered] = true
		out = append(out, query)
		if len(out) >= limit {
			break
		}
	}
	return out
}
// limitCollectorQueries caps how many queries a given collector receives.
// Envato/Artgrid get up to 4, Google Video 3, everything else 2; the
// budget shrinks by one (but never below 1) when only backfilling missing
// results.
func limitCollectorQueries(collector string, queries []string, onlyMissing bool) []string {
	var budget int
	switch collector {
	case "Envato", "Artgrid":
		budget = 4
	case "Google Video":
		budget = 3
	default:
		budget = 2
	}
	if onlyMissing {
		budget--
	}
	if budget < 1 {
		budget = 1
	}
	return limitQueries(queries, budget)
}
// cloneSearchResults returns an independent shallow copy of items so
// callers can mutate the result without affecting the source slice. An
// empty or nil input yields a non-nil empty slice.
func cloneSearchResults(items []SearchResult) []SearchResult {
	out := make([]SearchResult, 0, len(items))
	return append(out, items...)
}
// shuffleStrings randomizes the order of values in place using a
// time-seeded source; slices shorter than two elements are left untouched.
func shuffleStrings(values []string) {
	if len(values) < 2 {
		return
	}
	rand.New(rand.NewSource(time.Now().UnixNano())).Shuffle(len(values), func(a, b int) {
		values[a], values[b] = values[b], values[a]
	})
}
// htmlEntityReplacer decodes the HTML entities that commonly appear in
// scraped page metadata, including the numeric and XML forms of quote and
// apostrophe the original replacer missed. Built once at package scope
// (strings.Replacer is safe for concurrent use) instead of per call.
// For full entity coverage html.UnescapeString would be needed; this
// lightweight replacer avoids the extra import.
var htmlEntityReplacer = strings.NewReplacer(
	"&amp;", "&",
	"&quot;", `"`,
	"&#34;", `"`,
	"&#39;", "'",
	"&#x27;", "'",
	"&apos;", "'",
	"&lt;", "<",
	"&gt;", ">",
)

// htmlUnescape decodes common HTML entities in a single pass.
func htmlUnescape(text string) string {
	return htmlEntityReplacer.Replace(text)
}
// sourceWeight ranks result providers for sorting: Envato highest (3),
// then Artgrid (2), Google Video (1); unknown sources sort last (0).
func sourceWeight(source string) int {
	if source == "Envato" {
		return 3
	}
	if source == "Artgrid" {
		return 2
	}
	if source == "Google Video" {
		return 1
	}
	return 0
}
// minInt returns the smaller of a and b.
func minInt(a, b int) int {
	if b < a {
		return b
	}
	return a
}