1121 lines
34 KiB
Go
1121 lines
34 KiB
Go
package services
|
||
|
||
import (
|
||
"encoding/base64"
|
||
"encoding/json"
|
||
"fmt"
|
||
"io"
|
||
"math/rand"
|
||
"net/http"
|
||
"net/url"
|
||
"os/exec"
|
||
"regexp"
|
||
"sort"
|
||
"strings"
|
||
"sync"
|
||
"time"
|
||
)
|
||
|
||
// SearchResult is one media hit produced by the search service. The JSON
// field names are fixed by the struct tags.
type SearchResult struct {
	// Title is the human-readable title of the hit.
	Title string `json:"title"`
	// Link is the URL of the result page; it is also used as the dedup key.
	Link string `json:"link"`
	// DisplayLink is the host shown for the result.
	DisplayLink string `json:"displayLink"`
	// Snippet is the short description text of the hit.
	Snippet string `json:"snippet"`
	// ThumbnailURL is an image URL for the hit; it may be empty.
	ThumbnailURL string `json:"thumbnailUrl"`
	// PreviewVideoURL is a playable preview URL; it may be empty.
	PreviewVideoURL string `json:"previewVideoUrl"`
	// Source names the collector/platform that produced the hit.
	Source string `json:"source"`
}
|
||
|
||
type SearchService struct {
|
||
BaseURL string
|
||
GoogleVideoEngine string
|
||
WebEngine string
|
||
Client *http.Client
|
||
collectors []searchCollector
|
||
Debug func(message string, data any)
|
||
}
|
||
|
||
func NewSearchService(baseURL, googleVideoEngine, webEngine string) *SearchService {
|
||
if googleVideoEngine == "" {
|
||
googleVideoEngine = "google videos"
|
||
}
|
||
if webEngine == "" {
|
||
webEngine = "google"
|
||
}
|
||
return &SearchService{
|
||
BaseURL: strings.TrimRight(baseURL, "/"),
|
||
GoogleVideoEngine: googleVideoEngine,
|
||
WebEngine: webEngine,
|
||
Client: &http.Client{Timeout: 20 * time.Second},
|
||
collectors: []searchCollector{
|
||
envatoCollector{},
|
||
artgridCollector{},
|
||
googleVideoCollector{},
|
||
},
|
||
}
|
||
}
|
||
|
||
func (s *SearchService) SearchMedia(queries []string, enabledPlatforms map[string]bool) ([]SearchResult, error) {
|
||
return s.SearchMediaWithDeadline(queries, enabledPlatforms, time.Time{})
|
||
}
|
||
|
||
func (s *SearchService) SearchMediaWithDeadline(queries []string, enabledPlatforms map[string]bool, deadline time.Time) ([]SearchResult, error) {
|
||
if s.BaseURL == "" {
|
||
return nil, fmt.Errorf("searxng base url is not configured")
|
||
}
|
||
s.debug("search_service:start", map[string]any{
|
||
"queries": queries,
|
||
"enabledPlatforms": enabledPlatforms,
|
||
})
|
||
|
||
seen := map[string]bool{}
|
||
sourceCounts := map[string]int{}
|
||
results := make([]SearchResult, 0, 90)
|
||
var lastErr error
|
||
|
||
baseQueries := limitQueries(queries, 6)
|
||
shuffleStrings(baseQueries)
|
||
primaryQueries := baseQueries[:minInt(len(baseQueries), 3)]
|
||
runSearchPass := func(bases []string, onlyMissing bool) {
|
||
for _, base := range bases {
|
||
if !deadline.IsZero() && time.Now().After(deadline) {
|
||
s.debug("search_service:deadline_reached", map[string]any{"stage": "runSearchPass", "base": base})
|
||
return
|
||
}
|
||
base = strings.TrimSpace(base)
|
||
if base == "" {
|
||
continue
|
||
}
|
||
for _, collector := range s.collectors {
|
||
if !deadline.IsZero() && time.Now().After(deadline) {
|
||
s.debug("search_service:deadline_reached", map[string]any{"stage": "collectorLoop", "collector": collector.Name()})
|
||
return
|
||
}
|
||
if !collector.Enabled(enabledPlatforms) {
|
||
continue
|
||
}
|
||
if sourceCounts[collector.Name()] >= collector.MaxResults() {
|
||
continue
|
||
}
|
||
if onlyMissing && sourceCounts[collector.Name()] > 0 {
|
||
continue
|
||
}
|
||
searchQueries := collector.BuildQueries(base)
|
||
shuffleStrings(searchQueries)
|
||
s.debug("search_service:collector_queries", map[string]any{
|
||
"collector": collector.Name(),
|
||
"base": base,
|
||
"onlyMissing": onlyMissing,
|
||
"searchQueries": searchQueries,
|
||
})
|
||
for _, searchQuery := range searchQueries {
|
||
if !deadline.IsZero() && time.Now().After(deadline) {
|
||
s.debug("search_service:deadline_reached", map[string]any{"stage": "queryLoop", "collector": collector.Name(), "query": searchQuery})
|
||
return
|
||
}
|
||
if sourceCounts[collector.Name()] >= collector.MaxResults() {
|
||
break
|
||
}
|
||
items, err := collector.Collect(s, searchQuery)
|
||
if err != nil {
|
||
s.debug("search_service:collector_error", map[string]any{
|
||
"collector": collector.Name(),
|
||
"query": searchQuery,
|
||
"error": err.Error(),
|
||
})
|
||
lastErr = err
|
||
continue
|
||
}
|
||
s.debug("search_service:collector_results", map[string]any{
|
||
"collector": collector.Name(),
|
||
"query": searchQuery,
|
||
"rawCount": len(items),
|
||
"sourceCount": sourceCounts[collector.Name()],
|
||
})
|
||
for _, item := range items {
|
||
item = normalizeResultForCollector(collector.Name(), item)
|
||
if item.Link == "" || seen[item.Link] || !collector.Accept(item) {
|
||
continue
|
||
}
|
||
seen[item.Link] = true
|
||
results = append(results, item)
|
||
sourceCounts[collector.Name()]++
|
||
if sourceCounts[collector.Name()] >= collector.MaxResults() {
|
||
break
|
||
}
|
||
}
|
||
}
|
||
}
|
||
}
|
||
}
|
||
|
||
runSearchPass(primaryQueries, false)
|
||
if len(baseQueries) > len(primaryQueries) {
|
||
runSearchPass(baseQueries[len(primaryQueries):], true)
|
||
}
|
||
|
||
if len(results) == 0 && lastErr != nil {
|
||
return nil, lastErr
|
||
}
|
||
|
||
sort.SliceStable(results, func(i, j int) bool {
|
||
return sourceWeight(results[i].Source) > sourceWeight(results[j].Source)
|
||
})
|
||
s.debug("search_service:complete", map[string]any{
|
||
"resultCount": len(results),
|
||
"sourceCounts": sourceCounts,
|
||
"hadError": lastErr != nil,
|
||
})
|
||
return s.EnrichResultsWithDeadline(results, deadline), nil
|
||
}
|
||
|
||
func (s *SearchService) EnrichResults(results []SearchResult) []SearchResult {
|
||
return s.EnrichResultsWithDeadline(results, time.Time{})
|
||
}
|
||
|
||
func (s *SearchService) EnrichResultsWithDeadline(results []SearchResult, deadline time.Time) []SearchResult {
|
||
limit := minInt(len(results), 14)
|
||
if limit == 0 {
|
||
return results
|
||
}
|
||
s.debug("search_service:enrich_start", map[string]any{
|
||
"total": len(results),
|
||
"limit": limit,
|
||
})
|
||
|
||
enriched := make([]SearchResult, len(results))
|
||
copy(enriched, results)
|
||
|
||
var wg sync.WaitGroup
|
||
sem := make(chan struct{}, 4)
|
||
for idx := 0; idx < limit; idx++ {
|
||
wg.Add(1)
|
||
go func(i int) {
|
||
defer wg.Done()
|
||
if !deadline.IsZero() && time.Now().After(deadline) {
|
||
return
|
||
}
|
||
sem <- struct{}{}
|
||
defer func() { <-sem }()
|
||
s.debug("search_service:enrich_item_start", map[string]any{
|
||
"index": i,
|
||
"link": enriched[i].Link,
|
||
"source": enriched[i].Source,
|
||
})
|
||
enriched[i] = s.enrichResult(enriched[i])
|
||
s.debug("search_service:enrich_item_done", map[string]any{
|
||
"index": i,
|
||
"link": enriched[i].Link,
|
||
"source": enriched[i].Source,
|
||
"thumbnail": strings.TrimSpace(enriched[i].ThumbnailURL) != "",
|
||
"preview": strings.TrimSpace(enriched[i].PreviewVideoURL) != "",
|
||
"title": truncateForDebug(enriched[i].Title, 120),
|
||
})
|
||
}(idx)
|
||
}
|
||
wg.Wait()
|
||
s.debug("search_service:enrich_complete", map[string]any{"limit": limit})
|
||
return enriched
|
||
}
|
||
|
||
func (s *SearchService) enrichResult(result SearchResult) SearchResult {
|
||
for _, collector := range s.collectors {
|
||
if collector.Name() == result.Source {
|
||
return collector.Enrich(s, result)
|
||
}
|
||
}
|
||
if result.ThumbnailURL == "" {
|
||
result.ThumbnailURL = deriveThumbnail(result.Link)
|
||
}
|
||
return result
|
||
}
|
||
|
||
func (s *SearchService) searchWithFallback(query, categories, engine, source string) ([]SearchResult, error) {
|
||
s.debug("search_service:search_with_fallback", map[string]any{
|
||
"query": query,
|
||
"categories": categories,
|
||
"engine": engine,
|
||
"source": source,
|
||
})
|
||
items, err := s.search(query, categories, engine, source)
|
||
if err == nil {
|
||
return items, nil
|
||
}
|
||
s.debug("search_service:search_with_fallback_primary_error", map[string]any{
|
||
"query": query,
|
||
"engine": engine,
|
||
"error": err.Error(),
|
||
})
|
||
if strings.TrimSpace(engine) == "" {
|
||
return nil, err
|
||
}
|
||
return s.search(query, categories, "", source)
|
||
}
|
||
|
||
func (s *SearchService) enrichEnvato(result SearchResult) SearchResult {
|
||
s.debug("search_service:enrich_envato_start", map[string]any{"link": result.Link})
|
||
html, err := s.fetchText(result.Link)
|
||
if err != nil {
|
||
s.debug("search_service:enrich_envato_fetch_error", map[string]any{"link": result.Link, "error": err.Error()})
|
||
return result
|
||
}
|
||
videoMeta := extractVideoObjectJSONLD(html)
|
||
result.Title = firstNonEmpty(
|
||
cleanEnvatoTitle(videoMeta.Name),
|
||
extractMetaContent(html, "og:title"),
|
||
result.Title,
|
||
)
|
||
result.Snippet = firstNonEmpty(
|
||
cleanEnvatoDescription(videoMeta.Description),
|
||
extractMetaContent(html, "og:description"),
|
||
extractMetaContent(html, "description"),
|
||
result.Snippet,
|
||
)
|
||
|
||
pageThumbnail := firstNonEmpty(
|
||
videoMeta.ThumbnailURL,
|
||
extractMetaContent(html, "og:image"),
|
||
extractMetaContent(html, "twitter:image"),
|
||
extractJSONLDValue(html, "thumbnailUrl"),
|
||
)
|
||
if shouldPreferPageThumbnail(result.ThumbnailURL, result.Link) {
|
||
result.ThumbnailURL = pageThumbnail
|
||
}
|
||
if result.PreviewVideoURL == "" {
|
||
result.PreviewVideoURL = firstNonEmpty(
|
||
videoMeta.ContentURL,
|
||
extractJSONLDValue(html, "contentUrl"),
|
||
extractMetaContent(html, "twitter:player:stream"),
|
||
extractVideoPreviewURL(html),
|
||
extractEnvatoPreviewFromHydration(html),
|
||
deriveEnvatoPreviewFromThumbnail(pageThumbnail),
|
||
deriveEnvatoPreviewFromThumbnail(result.ThumbnailURL),
|
||
)
|
||
}
|
||
if result.PreviewVideoURL == "" {
|
||
time.Sleep(1200 * time.Millisecond)
|
||
if retryHTML, retryErr := s.fetchText(result.Link); retryErr == nil {
|
||
result.PreviewVideoURL = firstNonEmpty(
|
||
extractJSONLDValue(retryHTML, "contentUrl"),
|
||
extractMetaContent(retryHTML, "twitter:player:stream"),
|
||
extractVideoPreviewURL(retryHTML),
|
||
extractEnvatoPreviewFromHydration(retryHTML),
|
||
deriveEnvatoPreviewFromThumbnail(pageThumbnail),
|
||
deriveEnvatoPreviewFromThumbnail(result.ThumbnailURL),
|
||
)
|
||
}
|
||
}
|
||
s.debug("search_service:enrich_envato_done", map[string]any{
|
||
"link": result.Link,
|
||
"thumbnail": strings.TrimSpace(result.ThumbnailURL) != "",
|
||
"preview": strings.TrimSpace(result.PreviewVideoURL) != "",
|
||
})
|
||
return result
|
||
}
|
||
|
||
func (s *SearchService) enrichArtgrid(result SearchResult) SearchResult {
|
||
clipID := extractArtgridClipID(result.Link)
|
||
if clipID == "" {
|
||
s.debug("search_service:enrich_artgrid_skip", map[string]any{"link": result.Link, "reason": "missing clip id"})
|
||
return result
|
||
}
|
||
s.debug("search_service:enrich_artgrid_start", map[string]any{"link": result.Link, "clipId": clipID})
|
||
|
||
apiURL := "https://artgrid.io/api/clip/details?clipId=" + clipID
|
||
body, err := s.fetchJSONText(apiURL)
|
||
if err == nil {
|
||
urls := collectURLs(body)
|
||
if result.ThumbnailURL == "" {
|
||
result.ThumbnailURL = pickImageURL(urls)
|
||
}
|
||
if result.PreviewVideoURL == "" {
|
||
result.PreviewVideoURL = pickVideoURL(urls)
|
||
}
|
||
}
|
||
if err != nil {
|
||
s.debug("search_service:enrich_artgrid_api_error", map[string]any{"link": result.Link, "clipId": clipID, "error": err.Error()})
|
||
}
|
||
|
||
if result.ThumbnailURL == "" || result.PreviewVideoURL == "" {
|
||
html, err := s.fetchText(result.Link)
|
||
if err == nil {
|
||
if !isMatchingArtgridClipPage(html, clipID) {
|
||
s.debug("search_service:enrich_artgrid_html_mismatch", map[string]any{"link": result.Link, "clipId": clipID})
|
||
return result
|
||
}
|
||
result.Title = firstNonEmpty(
|
||
cleanArtgridTitle(extractMetaContent(html, "og:title")),
|
||
cleanArtgridTitle(extractMetaContent(html, "title")),
|
||
result.Title,
|
||
)
|
||
result.Snippet = firstNonEmpty(
|
||
cleanArtgridDescription(extractMetaContent(html, "og:description")),
|
||
cleanArtgridDescription(extractMetaContent(html, "description")),
|
||
result.Snippet,
|
||
)
|
||
pageThumbnail := firstNonEmpty(
|
||
extractMetaContent(html, "og:image"),
|
||
extractMetaContent(html, "twitter:image"),
|
||
extractArtgridBackgroundThumbnail(html, clipID),
|
||
extractJSONLDValue(html, "image"),
|
||
)
|
||
if shouldPreferPageThumbnail(result.ThumbnailURL, result.Link) {
|
||
result.ThumbnailURL = pageThumbnail
|
||
}
|
||
if result.PreviewVideoURL == "" {
|
||
result.PreviewVideoURL = firstNonEmpty(
|
||
extractJSONLDValue(html, "contentUrl"),
|
||
extractMetaContent(html, "twitter:player:stream"),
|
||
extractVideoPreviewURL(html),
|
||
)
|
||
}
|
||
if result.PreviewVideoURL == "" {
|
||
time.Sleep(1200 * time.Millisecond)
|
||
if retryHTML, retryErr := s.fetchText(result.Link); retryErr == nil {
|
||
result.PreviewVideoURL = firstNonEmpty(
|
||
extractJSONLDValue(retryHTML, "contentUrl"),
|
||
extractMetaContent(retryHTML, "twitter:player:stream"),
|
||
extractVideoPreviewURL(retryHTML),
|
||
)
|
||
}
|
||
}
|
||
}
|
||
}
|
||
s.debug("search_service:enrich_artgrid_done", map[string]any{
|
||
"link": result.Link,
|
||
"clipId": clipID,
|
||
"thumbnail": strings.TrimSpace(result.ThumbnailURL) != "",
|
||
"preview": strings.TrimSpace(result.PreviewVideoURL) != "",
|
||
})
|
||
|
||
return result
|
||
}
|
||
|
||
func (s *SearchService) search(query, categories, engine, source string) ([]SearchResult, error) {
|
||
values := url.Values{}
|
||
values.Set("q", query)
|
||
values.Set("format", "json")
|
||
values.Set("safesearch", "0")
|
||
values.Set("language", "en-US")
|
||
if categories != "" {
|
||
values.Set("categories", categories)
|
||
}
|
||
if engine != "" {
|
||
values.Set("engines", engine)
|
||
}
|
||
|
||
endpoint := s.BaseURL + "/search?" + values.Encode()
|
||
s.debug("search_service:searx_request", map[string]any{
|
||
"endpoint": endpoint,
|
||
"query": query,
|
||
"categories": categories,
|
||
"engine": engine,
|
||
"source": source,
|
||
})
|
||
resp, err := s.Client.Get(endpoint)
|
||
if err != nil {
|
||
return nil, err
|
||
}
|
||
defer resp.Body.Close()
|
||
|
||
if resp.StatusCode >= 300 {
|
||
return nil, fmt.Errorf("searxng returned status %d for query %q", resp.StatusCode, query)
|
||
}
|
||
|
||
var payload struct {
|
||
Results []struct {
|
||
Title string `json:"title"`
|
||
URL string `json:"url"`
|
||
Content string `json:"content"`
|
||
Thumbnail string `json:"thumbnail"`
|
||
ThumbnailSrc string `json:"thumbnail_src"`
|
||
ImgSrc string `json:"img_src"`
|
||
ParsedURL []any `json:"parsed_url"`
|
||
Engine string `json:"engine"`
|
||
} `json:"results"`
|
||
}
|
||
if err := json.NewDecoder(resp.Body).Decode(&payload); err != nil {
|
||
return nil, fmt.Errorf("searxng JSON decode failed for query %q: %w", query, err)
|
||
}
|
||
s.debug("search_service:searx_response", map[string]any{
|
||
"query": query,
|
||
"source": source,
|
||
"rawCount": len(payload.Results),
|
||
})
|
||
|
||
results := make([]SearchResult, 0, len(payload.Results))
|
||
for _, item := range payload.Results {
|
||
link := strings.TrimSpace(item.URL)
|
||
if link == "" {
|
||
continue
|
||
}
|
||
results = append(results, SearchResult{
|
||
Title: item.Title,
|
||
Link: link,
|
||
DisplayLink: inferDisplayLink(link, item.ParsedURL),
|
||
Snippet: item.Content,
|
||
ThumbnailURL: firstNonEmpty(item.Thumbnail, item.ThumbnailSrc, item.ImgSrc, deriveThumbnail(link)),
|
||
Source: normalizeSource(source, link, item.Engine),
|
||
})
|
||
}
|
||
return results, nil
|
||
}
|
||
|
||
func (s *SearchService) debug(message string, data any) {
|
||
if s != nil && s.Debug != nil {
|
||
s.Debug(message, data)
|
||
}
|
||
}
|
||
|
||
// truncateForDebug trims surrounding whitespace from text and, when the
// result is longer than limit bytes, cuts it and appends "...".
//
// The cut never splits a multi-byte UTF-8 character: it backs up to the
// nearest character boundary at or below limit. A negative limit is treated
// as zero instead of panicking.
func truncateForDebug(text string, limit int) string {
	if limit < 0 {
		limit = 0
	}
	trimmed := strings.TrimSpace(text)
	if len(trimmed) <= limit {
		return trimmed
	}

	// Ranging over a string yields the byte offset of each rune start, so cut
	// ends up as the largest rune boundary that does not exceed limit.
	cut := 0
	for offset := range trimmed {
		if offset > limit {
			break
		}
		cut = offset
	}
	return trimmed[:cut] + "..."
}
|
||
|
||
// buildGoogleVideoQueries expands base into three quoted footage-oriented
// search queries that exclude tutorial/review style content.
func buildGoogleVideoQueries(base string) []string {
	templates := [...]string{
		`"%s" ("stock footage" OR "b-roll" OR cinematic OR "establishing shot" OR editorial) -tutorial -"how to" -review -reaction -course -podcast -vlog -interview -breakdown -edit -editing`,
		`"%s" ("cinematic b-roll" OR "establishing shot" OR "drone footage" OR "urban footage") -tutorial -reaction -vlog -podcast`,
		`"%s" ("night drive" OR "city footage" OR "street footage" OR "editorial footage") -tutorial -review -music`,
	}
	queries := make([]string, 0, len(templates))
	for _, tmpl := range templates {
		queries = append(queries, fmt.Sprintf(tmpl, base))
	}
	return queries
}
|
||
|
||
// buildEnvatoQueries expands base into four quoted search queries restricted
// to elements.envato.com.
func buildEnvatoQueries(base string) []string {
	templates := [...]string{
		`"%s" ("stock footage" OR "stock video" OR "b-roll" OR cinematic) site:elements.envato.com`,
		`"%s" ("stock footage" OR "stock video" OR "b-roll" OR cinematic) site:elements.envato.com/stock-video`,
		`"%s" ("motion graphics" OR "backgrounds" OR "establishing shot" OR "loop") site:elements.envato.com`,
		`"%s" ("urban" OR "night city" OR "cyberpunk" OR "sci-fi") site:elements.envato.com`,
	}
	queries := make([]string, 0, len(templates))
	for _, tmpl := range templates {
		queries = append(queries, fmt.Sprintf(tmpl, base))
	}
	return queries
}
|
||
|
||
// buildArtgridQueries expands base into six quoted search queries restricted
// to Artgrid clip pages and Artlist stock-footage clip pages.
func buildArtgridQueries(base string) []string {
	templates := [...]string{
		`"%s" ("stock footage" OR "b-roll" OR cinematic OR editorial) site:artgrid.io/clip/`,
		`"%s" ("footage" OR "cinematic" OR "establishing shot") site:artgrid.io/clip/`,
		`"%s" ("stock footage" OR "b-roll" OR cinematic OR editorial) site:artlist.io/stock-footage/clip/`,
		`"%s" ("footage" OR "cinematic" OR "establishing shot") site:artlist.io/stock-footage/clip/`,
		`"%s" ("night drive" OR "urban night" OR "wet road" OR "cyberpunk") site:artgrid.io/clip/`,
		`"%s" ("drone" OR "city skyline" OR "street scene" OR "mood shot") site:artlist.io/stock-footage/clip/`,
	}
	queries := make([]string, 0, len(templates))
	for _, tmpl := range templates {
		queries = append(queries, fmt.Sprintf(tmpl, base))
	}
	return queries
}
|
||
|
||
func isUsefulGoogleVideoResult(result SearchResult) bool {
|
||
lowerLink := strings.ToLower(result.Link)
|
||
if !(strings.Contains(lowerLink, "youtube.com/watch") || strings.Contains(lowerLink, "youtu.be/") || strings.Contains(lowerLink, "youtube.com/shorts/")) {
|
||
return false
|
||
}
|
||
text := strings.ToLower(result.Title + " " + result.Snippet)
|
||
for _, banned := range []string{
|
||
"tutorial", "how to", "review", "reaction", "podcast", "interview", "walkthrough",
|
||
"course", "lesson", "edit tutorial", "editing tutorial", "premiere pro", "after effects",
|
||
"breakdown", "explained", "vlog", "tips", "guide", "learn", "free download",
|
||
"bgm", "music", "song", "lyrics", "audio", "soundtrack", "trailer", "teaser",
|
||
"full movie", "movie clip", "status", "whatsapp status", "fan cam", "fancam",
|
||
} {
|
||
if strings.Contains(text, banned) {
|
||
return false
|
||
}
|
||
}
|
||
return true
|
||
}
|
||
|
||
func isRenderableEnvatoResult(result SearchResult) bool {
|
||
parsed, err := url.Parse(result.Link)
|
||
if err != nil {
|
||
return false
|
||
}
|
||
host := strings.ToLower(parsed.Host)
|
||
path := strings.Trim(parsed.Path, "/")
|
||
if strings.Contains(host, "elements.envato.com") {
|
||
if path == "" || strings.Contains(path, "/stock-video") || strings.Contains(path, "/video-templates") {
|
||
return false
|
||
}
|
||
return regexp.MustCompile(`-[A-Z0-9]{6,}$`).MatchString(path)
|
||
}
|
||
return false
|
||
}
|
||
|
||
func isRenderableArtgridResult(result SearchResult) bool {
|
||
parsed, err := url.Parse(result.Link)
|
||
if err != nil {
|
||
return false
|
||
}
|
||
host := strings.ToLower(parsed.Host)
|
||
switch {
|
||
case strings.Contains(host, "artgrid.io"):
|
||
return regexp.MustCompile(`^/clip/[0-9]+/`).MatchString(parsed.Path)
|
||
case strings.Contains(host, "artlist.io"):
|
||
trimmedPath := strings.TrimSuffix(parsed.Path, "/")
|
||
return regexp.MustCompile(`^/stock-footage/clip/.+/[0-9]+$`).MatchString(trimmedPath)
|
||
default:
|
||
return false
|
||
}
|
||
}
|
||
|
||
// normalizeSource picks the source label for a hit. An explicit source wins.
// Otherwise the label is inferred from the link ("Envato", "Artgrid"), then
// from the engine name ("Google Video"), and finally falls back to the raw
// engine string. Link and engine matching is case-insensitive.
func normalizeSource(source, link, engine string) string {
	if source != "" {
		return source
	}

	lowerLink := strings.ToLower(link)
	if strings.Contains(lowerLink, "envato") || strings.Contains(lowerLink, "videohive") {
		return "Envato"
	}
	if strings.Contains(lowerLink, "artgrid") || strings.Contains(lowerLink, "artlist.io/stock-footage/clip/") {
		return "Artgrid"
	}
	if strings.Contains(strings.ToLower(engine), "google") {
		return "Google Video"
	}
	return engine
}
|
||
|
||
// inferDisplayLink returns the host to display for link. It prefers the
// second element of parsed when that element is a string; otherwise it parses
// link and returns its host, or "" if link cannot be parsed.
func inferDisplayLink(link string, parsed []any) string {
	if len(parsed) >= 2 {
		host, isString := parsed[1].(string)
		if isString {
			return host
		}
	}
	u, err := url.Parse(link)
	if err != nil {
		return ""
	}
	return u.Host
}
|
||
|
||
func deriveThumbnail(link string) string {
|
||
if videoID := extractYouTubeID(link); videoID != "" {
|
||
return "https://i.ytimg.com/vi/" + videoID + "/hqdefault.jpg"
|
||
}
|
||
return ""
|
||
}
|
||
|
||
// youTubeIDPatterns capture the 11-character video ID from the supported
// YouTube URL shapes. Compiled once at package scope because the function
// runs for every search result.
var youTubeIDPatterns = []*regexp.Regexp{
	regexp.MustCompile(`(?:v=|\/shorts\/|\/embed\/)([A-Za-z0-9_-]{11})`),
	regexp.MustCompile(`youtu\.be\/([A-Za-z0-9_-]{11})`),
}

// extractYouTubeID returns the 11-character video ID found in link
// ("v=" query, /shorts/, /embed/ or youtu.be/ forms), or "" if none is found.
func extractYouTubeID(link string) string {
	for _, pattern := range youTubeIDPatterns {
		if matches := pattern.FindStringSubmatch(link); len(matches) == 2 {
			return matches[1]
		}
	}
	return ""
}
|
||
|
||
func extractMetaContent(html, property string) string {
|
||
patterns := []*regexp.Regexp{
|
||
regexp.MustCompile(`(?i)<meta[^>]+property=["']` + regexp.QuoteMeta(property) + `["'][^>]+content=["']([^"']+)`),
|
||
regexp.MustCompile(`(?i)<meta[^>]+name=["']` + regexp.QuoteMeta(property) + `["'][^>]+content=["']([^"']+)`),
|
||
}
|
||
for _, pattern := range patterns {
|
||
matches := pattern.FindStringSubmatch(html)
|
||
if len(matches) == 2 {
|
||
return htmlUnescape(matches[1])
|
||
}
|
||
}
|
||
return ""
|
||
}
|
||
|
||
// videoPreviewURLPattern matches http(s) URLs containing "mp4" or "m3u8",
// with an optional query string. Compiled once at package scope.
var videoPreviewURLPattern = regexp.MustCompile(`https?://[^"'[:space:]>]+(?:mp4|m3u8)(?:\?[^"'[:space:]>]*)?`)

// extractVideoPreviewURL scans html for mp4/m3u8 URLs, after undoing the
// common JSON escapings of "/". It returns the first URL whose text mentions
// "preview", "video" or "watermark" (case-insensitive), otherwise the first
// URL found, otherwise "".
func extractVideoPreviewURL(html string) string {
	// The replacements are applied in sequence; order matters because the
	// triple-backslash form contains the single-backslash form.
	normalizedHTML := strings.ReplaceAll(html, `\\\/`, `/`)
	normalizedHTML = strings.ReplaceAll(normalizedHTML, `\/`, `/`)
	normalizedHTML = strings.ReplaceAll(normalizedHTML, `\u002F`, `/`)

	matches := videoPreviewURLPattern.FindAllString(normalizedHTML, -1)
	if len(matches) == 0 {
		return ""
	}

	// Clean each candidate once instead of once per pass.
	candidates := make([]string, len(matches))
	for i, match := range matches {
		candidates[i] = strings.TrimSpace(strings.ReplaceAll(match, `\\`, ""))
	}

	for _, candidate := range candidates {
		lower := strings.ToLower(candidate)
		if strings.Contains(lower, "preview") || strings.Contains(lower, "video") || strings.Contains(lower, "watermark") {
			return candidate
		}
	}
	for _, candidate := range candidates {
		if candidate != "" {
			return candidate
		}
	}
	return ""
}
|
||
|
||
// artgridThumbnailURLPattern matches https image URLs (jpeg/jpg/png/webp)
// served from the Artgrid/Artlist image hosts. The character classes exclude
// quotes, backslashes, whitespace and '>' so a match stays inside a single
// attribute or CSS value. The part before the host is optional, so URLs
// hosted directly on one of the listed hosts match too.
var artgridThumbnailURLPattern = regexp.MustCompile(`https://[^"'\\\s>]*(?:artgrid\.imgix\.net|cms-public-artifacts\.artlist\.io|artlist-content-images\.imgix\.net)[^"'\\\s>]+(?:jpeg|jpg|png|webp)`)

// extractArtgridBackgroundThumbnail returns the first Artgrid/Artlist image
// URL in html that contains clipID or (case-insensitively)
// "graded-thumbnail". Returns "" when there is none.
func extractArtgridBackgroundThumbnail(html, clipID string) string {
	for _, match := range artgridThumbnailURLPattern.FindAllString(html, -1) {
		if strings.Contains(match, clipID) || strings.Contains(strings.ToLower(match), "graded-thumbnail") {
			return match
		}
	}
	return ""
}
|
||
|
||
// artgridClipIDPatterns capture the numeric clip ID from Artgrid
// ("/clip/<id>/...") and Artlist ("/stock-footage/clip/<slug...>/<id>") links.
// The former "[^/]+" Artlist variant is omitted: every link it matched is
// also matched by the ".+" form, with the same captured ID.
var artgridClipIDPatterns = []*regexp.Regexp{
	regexp.MustCompile(`/clip/([0-9]+)/`),
	regexp.MustCompile(`/stock-footage/clip/.+/([0-9]+)$`),
}

// extractArtgridClipID returns the numeric clip ID embedded in link, or ""
// when link has none. One trailing "/" on link is ignored.
func extractArtgridClipID(link string) string {
	trimmed := strings.TrimSuffix(link, "/")
	for _, pattern := range artgridClipIDPatterns {
		if matches := pattern.FindStringSubmatch(trimmed); len(matches) == 2 {
			return matches[1]
		}
	}
	return ""
}
|
||
|
||
func canonicalizeArtgridLink(link string) string {
|
||
trimmed := strings.TrimSpace(link)
|
||
if trimmed == "" {
|
||
return ""
|
||
}
|
||
clipID := extractArtgridClipID(trimmed)
|
||
if clipID == "" {
|
||
return trimmed
|
||
}
|
||
if strings.Contains(strings.ToLower(trimmed), "artgrid.io/clip/") {
|
||
return trimmed
|
||
}
|
||
parsed, err := url.Parse(trimmed)
|
||
if err != nil {
|
||
return trimmed
|
||
}
|
||
segments := strings.Split(strings.Trim(parsed.Path, "/"), "/")
|
||
slug := clipID
|
||
for idx, segment := range segments {
|
||
if segment == clipID && idx > 0 {
|
||
slug = segments[idx-1]
|
||
break
|
||
}
|
||
}
|
||
return "https://artgrid.io/clip/" + clipID + "/" + slug
|
||
}
|
||
|
||
func normalizeResultForCollector(source string, result SearchResult) SearchResult {
|
||
switch source {
|
||
case "Artgrid":
|
||
result.Link = canonicalizeArtgridLink(result.Link)
|
||
result.Source = "Artgrid"
|
||
case "Envato":
|
||
result.Source = "Envato"
|
||
case "Google Video":
|
||
result.Source = "Google Video"
|
||
}
|
||
return result
|
||
}
|
||
|
||
// collectURLsPattern matches an http(s) URL up to the next quote, backslash
// or whitespace. Compiled once at package scope.
var collectURLsPattern = regexp.MustCompile(`https?:\/\/[^"'\\\s]+`)

// collectURLs returns every distinct http(s) URL found in body, in order of
// first appearance.
func collectURLs(body string) []string {
	matches := collectURLsPattern.FindAllString(body, -1)
	seen := make(map[string]bool, len(matches))
	results := make([]string, 0, len(matches))
	for _, match := range matches {
		candidate := strings.TrimSpace(strings.Trim(match, `"'`))
		if candidate == "" || seen[candidate] {
			continue
		}
		seen[candidate] = true
		results = append(results, candidate)
	}
	return results
}
|
||
|
||
// pickImageURL returns the first URL that contains ".jpg", ".jpeg", ".png" or
// ".webp" (case-insensitive), or "" when none does.
func pickImageURL(urls []string) string {
	extensions := [...]string{".jpg", ".jpeg", ".png", ".webp"}
	for _, candidate := range urls {
		lowered := strings.ToLower(candidate)
		for _, ext := range extensions {
			if strings.Contains(lowered, ext) {
				return candidate
			}
		}
	}
	return ""
}
|
||
|
||
// pickVideoURL chooses a video URL from urls. It prefers the first ".m3u8"
// URL that also mentions "artgrid", "artlist" or "cdn"; failing that, the
// first URL containing ".mp4" or ".m3u8". Matching is case-insensitive.
// Returns "" when nothing qualifies.
func pickVideoURL(urls []string) string {
	fallback := ""
	haveFallback := false
	for _, candidate := range urls {
		lowered := strings.ToLower(candidate)
		isStream := strings.Contains(lowered, ".m3u8")
		if isStream {
			for _, hint := range [...]string{"artgrid", "artlist", "cdn"} {
				if strings.Contains(lowered, hint) {
					return candidate
				}
			}
		}
		// Remember the first generic video URL in case no preferred one exists.
		if !haveFallback && (isStream || strings.Contains(lowered, ".mp4")) {
			fallback = candidate
			haveFallback = true
		}
	}
	return fallback
}
|
||
|
||
func (s *SearchService) fetchText(target string) (string, error) {
|
||
req, err := newBrowserRequest(http.MethodGet, target, "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")
|
||
if err != nil {
|
||
return "", err
|
||
}
|
||
resp, err := s.Client.Do(req)
|
||
if err != nil {
|
||
return "", err
|
||
}
|
||
defer resp.Body.Close()
|
||
if resp.StatusCode == http.StatusForbidden || resp.StatusCode == http.StatusServiceUnavailable {
|
||
return fetchTextViaPython(target)
|
||
}
|
||
if resp.StatusCode >= 300 {
|
||
return "", fmt.Errorf("fetch returned status %d", resp.StatusCode)
|
||
}
|
||
data, err := io.ReadAll(io.LimitReader(resp.Body, 1024*1024))
|
||
if err != nil {
|
||
return "", err
|
||
}
|
||
if looksLikeCloudflareChallenge(string(data)) {
|
||
return fetchTextViaPython(target)
|
||
}
|
||
return string(data), nil
|
||
}
|
||
|
||
func (s *SearchService) fetchJSONText(target string) (string, error) {
|
||
req, err := newBrowserRequest(http.MethodGet, target, "application/json, text/json, */*")
|
||
if err != nil {
|
||
return "", err
|
||
}
|
||
resp, err := s.Client.Do(req)
|
||
if err != nil {
|
||
return "", err
|
||
}
|
||
defer resp.Body.Close()
|
||
if resp.StatusCode >= 300 {
|
||
return "", fmt.Errorf("json fetch returned status %d", resp.StatusCode)
|
||
}
|
||
data, err := io.ReadAll(io.LimitReader(resp.Body, 1024*1024))
|
||
if err != nil {
|
||
return "", err
|
||
}
|
||
return string(data), nil
|
||
}
|
||
|
||
// firstNonEmpty returns the first argument that is not blank (empty or
// whitespace-only). The value is returned as given, without trimming.
// Returns "" when every argument is blank.
func firstNonEmpty(values ...string) string {
	for idx := 0; idx < len(values); idx++ {
		if strings.TrimSpace(values[idx]) == "" {
			continue
		}
		return values[idx]
	}
	return ""
}
|
||
|
||
func shouldPreferPageThumbnail(current, pageLink string) bool {
|
||
current = strings.TrimSpace(current)
|
||
if current == "" {
|
||
return true
|
||
}
|
||
lower := strings.ToLower(current)
|
||
if strings.Contains(lower, "imgs.search.brave.com") || strings.Contains(lower, "googleusercontent.com") || strings.Contains(lower, "bing.com") {
|
||
return true
|
||
}
|
||
currentHost := hostOf(current)
|
||
pageHost := hostOf(pageLink)
|
||
return currentHost == "" || (pageHost != "" && currentHost != pageHost)
|
||
}
|
||
|
||
// hostOf returns the lower-cased host component of raw, or "" when raw cannot
// be parsed as a URL.
func hostOf(raw string) string {
	if u, err := url.Parse(raw); err == nil {
		return strings.ToLower(u.Host)
	}
	return ""
}
|
||
|
||
func extractJSONLDValue(html, key string) string {
|
||
pattern := regexp.MustCompile(`"` + regexp.QuoteMeta(key) + `"\s*:\s*"(https?:\\?/\\?/[^"]+|[^"]+)"`)
|
||
matches := pattern.FindAllStringSubmatch(html, -1)
|
||
for _, match := range matches {
|
||
if len(match) != 2 {
|
||
continue
|
||
}
|
||
value := strings.ReplaceAll(match[1], `\/`, `/`)
|
||
value = strings.ReplaceAll(value, `\u002F`, `/`)
|
||
value = strings.ReplaceAll(value, `\\`, "")
|
||
value = htmlUnescape(value)
|
||
if strings.TrimSpace(value) != "" {
|
||
return value
|
||
}
|
||
}
|
||
return ""
|
||
}
|
||
|
||
// videoObjectMetadata holds the fields read from a JSON-LD VideoObject.
type videoObjectMetadata struct {
	// Name is the "name" property.
	Name string
	// Description is the "description" property.
	Description string
	// ThumbnailURL is the "thumbnailUrl" property.
	ThumbnailURL string
	// ContentURL is the "contentUrl" property.
	ContentURL string
}
|
||
|
||
func extractVideoObjectJSONLD(html string) videoObjectMetadata {
|
||
pattern := regexp.MustCompile(`(?is)<script[^>]+type=["']application/ld\+json["'][^>]*>(.*?)</script>`)
|
||
matches := pattern.FindAllStringSubmatch(html, -1)
|
||
for _, match := range matches {
|
||
if len(match) != 2 {
|
||
continue
|
||
}
|
||
var payload map[string]any
|
||
if err := json.Unmarshal([]byte(htmlUnescape(strings.TrimSpace(match[1]))), &payload); err != nil {
|
||
continue
|
||
}
|
||
typeName, _ := payload["@type"].(string)
|
||
if !strings.EqualFold(typeName, "VideoObject") {
|
||
continue
|
||
}
|
||
meta := videoObjectMetadata{
|
||
Name: stringValue(payload["name"]),
|
||
Description: stringValue(payload["description"]),
|
||
ThumbnailURL: stringValue(payload["thumbnailUrl"]),
|
||
ContentURL: stringValue(payload["contentUrl"]),
|
||
}
|
||
if meta.Name != "" || meta.Description != "" || meta.ThumbnailURL != "" || meta.ContentURL != "" {
|
||
return meta
|
||
}
|
||
}
|
||
return videoObjectMetadata{}
|
||
}
|
||
|
||
func stringValue(value any) string {
|
||
switch typed := value.(type) {
|
||
case string:
|
||
return htmlUnescape(strings.TrimSpace(typed))
|
||
case []any:
|
||
for _, item := range typed {
|
||
if text := stringValue(item); text != "" {
|
||
return text
|
||
}
|
||
}
|
||
}
|
||
return ""
|
||
}
|
||
|
||
func cleanEnvatoTitle(title string) string {
|
||
title = htmlUnescape(strings.TrimSpace(title))
|
||
return strings.TrimSuffix(title, " - Envato")
|
||
}
|
||
|
||
func cleanEnvatoDescription(description string) string {
|
||
description = htmlUnescape(strings.TrimSpace(description))
|
||
description = strings.ReplaceAll(description, "&", "&")
|
||
return description
|
||
}
|
||
|
||
func cleanArtgridTitle(title string) string {
|
||
title = htmlUnescape(strings.TrimSpace(title))
|
||
replacements := []string{
|
||
" | Stock Video Footage * Artgrid.io*",
|
||
" | Stock Video Footage - Artgrid.io",
|
||
" | Royalty Free Stock Footage – Artgrid.io",
|
||
" | Royalty Free Stock Footage - Artgrid.io",
|
||
}
|
||
for _, suffix := range replacements {
|
||
title = strings.TrimSuffix(title, suffix)
|
||
}
|
||
if idx := strings.Index(title, " by "); idx > 0 {
|
||
title = title[:idx]
|
||
}
|
||
return strings.TrimSpace(title)
|
||
}
|
||
|
||
func cleanArtgridDescription(description string) string {
|
||
description = htmlUnescape(strings.TrimSpace(description))
|
||
description = strings.TrimSuffix(description, " Download this royalty free video and other Stunning Stock HD Videos from Artgrid.")
|
||
description = strings.TrimSuffix(description, " Download this royalty free video and other Stunning Stock HD Videos from Artgrid")
|
||
if parts := strings.SplitN(description, " | ", 2); len(parts) == 2 {
|
||
description = parts[1]
|
||
}
|
||
description = strings.TrimSuffix(description, " | Royalty Free Stock Footage – Artgrid.io")
|
||
description = strings.TrimSuffix(description, " | Royalty Free Stock Footage - Artgrid.io")
|
||
return strings.TrimSpace(description)
|
||
}
|
||
|
||
func isMatchingArtgridClipPage(html, clipID string) bool {
|
||
if clipID == "" {
|
||
return false
|
||
}
|
||
ogURL := extractMetaContent(html, "og:url")
|
||
canonical := extractCanonicalURL(html)
|
||
lowerHTML := strings.ToLower(html)
|
||
for _, candidate := range []string{ogURL, canonical} {
|
||
if strings.Contains(candidate, clipID) {
|
||
return true
|
||
}
|
||
}
|
||
if strings.Contains(lowerHTML, "main-clipvideo_"+clipID) || strings.Contains(lowerHTML, "/clip/"+clipID+"/") {
|
||
return true
|
||
}
|
||
return false
|
||
}
|
||
|
||
func extractCanonicalURL(html string) string {
|
||
pattern := regexp.MustCompile(`(?i)<link[^>]+rel=["']canonical["'][^>]+href=["']([^"']+)`)
|
||
matches := pattern.FindStringSubmatch(html)
|
||
if len(matches) == 2 {
|
||
return htmlUnescape(matches[1])
|
||
}
|
||
return ""
|
||
}
|
||
|
||
func deriveEnvatoPreviewFromThumbnail(thumbnail string) string {
|
||
candidate := htmlUnescape(strings.TrimSpace(thumbnail))
|
||
if candidate == "" {
|
||
return ""
|
||
}
|
||
candidate = strings.ReplaceAll(candidate, "&", "&")
|
||
if strings.Contains(candidate, "/video_preview/") {
|
||
if idx := strings.Index(candidate, "?"); idx >= 0 {
|
||
candidate = candidate[:idx]
|
||
}
|
||
return regexp.MustCompile(`/video_preview/[^/]+\.(?:jpg|jpeg|png|webp)$`).ReplaceAllString(candidate, `/watermarked_preview/watermarked_preview.mp4`)
|
||
}
|
||
return ""
|
||
}
|
||
|
||
func extractEnvatoPreviewFromHydration(html string) string {
|
||
encoded := extractWindowAssignedValue(html, "INITIAL_HYDRATION_DATA")
|
||
if encoded == "" {
|
||
return ""
|
||
}
|
||
|
||
decoded, err := base64.StdEncoding.DecodeString(encoded)
|
||
if err != nil {
|
||
return ""
|
||
}
|
||
urls := collectURLs(string(decoded))
|
||
return firstNonEmpty(pickBestEnvatoPreviewURL(urls), extractVideoPreviewURL(string(decoded)))
|
||
}
|
||
|
||
// extractWindowAssignedValue returns the double-quoted string assigned to
// window.<variable> in html, i.e. the X in `window.NAME = "X"`. The variable
// name is matched literally (regex metacharacters are escaped). It returns ""
// when there is no such assignment or the quoted value is empty.
func extractWindowAssignedValue(html, variable string) string {
	expr := `window\.` + regexp.QuoteMeta(variable) + `\s*=\s*"([^"]+)"`
	groups := regexp.MustCompile(expr).FindStringSubmatch(html)
	if len(groups) != 2 {
		return ""
	}
	return groups[1]
}
|
||
|
||
// pickBestEnvatoPreviewURL chooses the most likely preview video from urls.
//
// Only URLs that contain "envatousercontent.com" and end in ".mp4"
// (case-insensitively) are eligible. Among those, preference goes to:
//  1. a "watermarked_preview" file on video-previews.elements.envatousercontent.com,
//  2. any other "watermarked_preview" file,
//  3. any remaining eligible URL.
//
// Within a tier the earliest URL in the slice wins. The tiers are checked
// most-specific first; checking the broad tier first would shadow the
// narrower ones because every tier-1 or tier-2 URL also satisfies tier 3.
// The original spelling of the URL is returned, or "" if nothing is eligible.
func pickBestEnvatoPreviewURL(urls []string) string {
	isEnvatoMP4 := func(lower string) bool {
		return strings.Contains(lower, "envatousercontent.com") && strings.HasSuffix(lower, ".mp4")
	}
	isWatermarked := func(lower string) bool {
		return isEnvatoMP4(lower) && strings.Contains(lower, "watermarked_preview")
	}
	isDedicatedHost := func(lower string) bool {
		return isWatermarked(lower) && strings.Contains(lower, "video-previews.elements.envatousercontent.com")
	}

	for _, matches := range []func(string) bool{isDedicatedHost, isWatermarked, isEnvatoMP4} {
		for _, item := range urls {
			if matches(strings.ToLower(item)) {
				return item
			}
		}
	}
	return ""
}
|
||
|
||
// newBrowserRequest builds a body-less HTTP request for target that carries
// desktop-Chrome-style User-Agent and Accept-Language headers. The Accept
// header is set only when accept is non-empty. Any error from
// http.NewRequest is returned unchanged together with a nil request.
func newBrowserRequest(method, target, accept string) (*http.Request, error) {
	request, err := http.NewRequest(method, target, nil)
	if err != nil {
		return nil, err
	}

	headers := request.Header
	headers.Set("User-Agent", "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36")
	headers.Set("Accept-Language", "en-US,en;q=0.9")
	if accept == "" {
		return request, nil
	}
	headers.Set("Accept", accept)
	return request, nil
}
|
||
|
||
func fetchTextViaPython(target string) (string, error) {
|
||
script := `
|
||
from urllib.request import Request, urlopen
|
||
import sys
|
||
req = Request(sys.argv[1], headers={
|
||
"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36",
|
||
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
|
||
"Accept-Language": "en-US,en;q=0.9",
|
||
})
|
||
with urlopen(req, timeout=20) as resp:
|
||
sys.stdout.buffer.write(resp.read(1024 * 1024))
|
||
`
|
||
output, err := exec.Command("python3", "-c", script, target).CombinedOutput()
|
||
if err != nil {
|
||
return "", fmt.Errorf("python fallback failed: %v: %s", err, truncateBytes(output, 300))
|
||
}
|
||
return string(output), nil
|
||
}
|
||
|
||
// looksLikeCloudflareChallenge reports whether body contains, in any letter
// case, one of the marker phrases "cf-mitigated", "attention required" or
// "just a moment".
func looksLikeCloudflareChallenge(body string) bool {
	haystack := strings.ToLower(body)
	for _, marker := range [...]string{"cf-mitigated", "attention required", "just a moment"} {
		if strings.Contains(haystack, marker) {
			return true
		}
	}
	return false
}
|
||
|
||
// truncateBytes converts data to a string, trims surrounding whitespace, and
// shortens it to at most limit bytes, appending "..." when anything was cut.
//
// The cut point is moved back to the nearest UTF-8 rune boundary so a
// multi-byte character is never split in half. A negative limit is treated
// as 0 instead of panicking.
func truncateBytes(data []byte, limit int) string {
	trimmed := strings.TrimSpace(string(data))
	if limit < 0 {
		limit = 0
	}
	if len(trimmed) <= limit {
		return trimmed
	}

	// Bytes of the form 10xxxxxx are UTF-8 continuation bytes; back off until
	// the cut lands on the first byte of a rune (or the start of the string).
	cut := limit
	for cut > 0 && trimmed[cut]&0xC0 == 0x80 {
		cut--
	}
	return trimmed[:cut] + "..."
}
|
||
|
||
// limitQueries returns at most limit queries, in their original order, after
// trimming whitespace, dropping blanks, and removing case-insensitive
// duplicates (the first spelling seen is kept).
//
// A non-positive limit yields an empty, non-nil slice. Without that guard a
// limit of 0 would still let one query through, and a negative limit would
// panic when sizing the result slice.
func limitQueries(queries []string, limit int) []string {
	if limit <= 0 {
		return []string{}
	}

	capacity := len(queries)
	if limit < capacity {
		capacity = limit
	}
	seen := make(map[string]bool, capacity)
	filtered := make([]string, 0, capacity)

	for _, item := range queries {
		trimmed := strings.TrimSpace(item)
		if trimmed == "" {
			continue
		}
		key := strings.ToLower(trimmed)
		if seen[key] {
			continue
		}
		seen[key] = true
		filtered = append(filtered, trimmed)
		if len(filtered) >= limit {
			break
		}
	}
	return filtered
}
|
||
|
||
// shuffleStrings reorders values in place using a generator freshly seeded
// from the current time in nanoseconds. Slices with fewer than two elements
// are left untouched.
func shuffleStrings(values []string) {
	count := len(values)
	if count <= 1 {
		return
	}

	swap := func(a, b int) {
		values[a], values[b] = values[b], values[a]
	}
	source := rand.NewSource(time.Now().UnixNano())
	rand.New(source).Shuffle(count, swap)
}
|
||
|
||
// htmlEntityReplacer decodes the five entities htmlUnescape understands. It
// is built once at package scope; a strings.Replacer is safe for concurrent
// use.
//
// NOTE(review): the apostrophe entity is written as "&#39;" — confirm this is
// the spelling the scraped pages actually use.
var htmlEntityReplacer = strings.NewReplacer(
	"&amp;", "&",
	"&quot;", `"`,
	"&#39;", "'",
	"&lt;", "<",
	"&gt;", ">",
)

// htmlUnescape decodes the entities &amp;, &quot;, &#39;, &lt; and &gt; in
// text in a single left-to-right pass. Because replaced text is not
// rescanned, double-encoded input such as "&amp;lt;" decodes one level only
// (to "&lt;"). All other entities are left as they are.
func htmlUnescape(text string) string {
	return htmlEntityReplacer.Replace(text)
}
|
||
|
||
// sourceWeight maps a result's source label to its ranking weight:
// "Envato" is 3, "Artgrid" is 2, "Google Video" is 1, and any other label
// (the match is exact and case-sensitive) is 0.
func sourceWeight(source string) int {
	// Listed from lowest to highest weight; weight = position + 1.
	ranked := [...]string{"Google Video", "Artgrid", "Envato"}
	for position, label := range ranked {
		if label == source {
			return position + 1
		}
	}
	return 0
}
|
||
|
||
// minInt returns the smaller of a and b.
func minInt(a, b int) int {
	if b <= a {
		return b
	}
	return a
}
|