Files
ai-media-hub/backend/services/cse.go
T
AI Assistant b78865d4bf
build-push / docker (push) Successful in 4m6s
Rewrite search flow and enrich preview assets
2026-03-13 12:50:25 +09:00

552 lines
14 KiB
Go

package services
import (
"encoding/json"
"fmt"
"io"
"net/http"
"net/url"
"regexp"
"sort"
"strings"
"sync"
"time"
)
// SearchResult is one media hit produced by the search service. The JSON
// tags define how each field is serialized.
type SearchResult struct {
	// Title is the result title as reported by the search backend.
	Title string `json:"title"`
	// Link is the result URL; it is also used as the de-duplication key.
	Link string `json:"link"`
	// DisplayLink is the host shown for the result.
	DisplayLink string `json:"displayLink"`
	// Snippet is the textual excerpt returned by the search backend.
	Snippet string `json:"snippet"`
	// ThumbnailURL is a still-image preview; it may be empty.
	ThumbnailURL string `json:"thumbnailUrl"`
	// PreviewVideoURL is a playable preview clip; it may be empty.
	PreviewVideoURL string `json:"previewVideoUrl"`
	// Source names the provider the result is attributed to
	// (for example "Envato", "Artgrid" or "Google Video").
	Source string `json:"source"`
}
// SearchService queries a SearXNG instance for media results and enriches
// them with thumbnails and preview clips.
type SearchService struct {
	// BaseURL is the SearXNG root URL that "/search" is appended to.
	BaseURL string
	// GoogleVideoEngine is the engine name sent for video-category searches.
	GoogleVideoEngine string
	// WebEngine is the engine name sent for general web searches.
	WebEngine string
	// Client performs every outbound HTTP request made by the service.
	Client *http.Client
}
// NewSearchService builds a SearchService for the given SearXNG base URL.
//
// Trailing slashes are stripped from baseURL. Empty engine names fall back to
// "google videos" (video engine) and "google" (web engine). The returned
// service owns an HTTP client with a 20 second timeout.
func NewSearchService(baseURL, googleVideoEngine, webEngine string) *SearchService {
	svc := &SearchService{
		BaseURL:           strings.TrimRight(baseURL, "/"),
		GoogleVideoEngine: "google videos",
		WebEngine:         "google",
		Client:            &http.Client{Timeout: 20 * time.Second},
	}
	// Only override the defaults when the caller supplied a value.
	if googleVideoEngine != "" {
		svc.GoogleVideoEngine = googleVideoEngine
	}
	if webEngine != "" {
		svc.WebEngine = webEngine
	}
	return svc
}
// SearchMedia runs every base query against each configured provider and
// returns the de-duplicated, ranked and enriched results.
//
// At most five distinct base queries are used. Each provider expands a base
// query into its own search strings; when a search fails it is retried once
// without an engine filter. Results are de-duplicated by link, filtered by the
// provider's acceptance predicate, stably ordered by descending source weight
// and finally passed through EnrichResults.
//
// An error is returned when the base URL is empty, or when no result was
// collected and at least one search failed (the most recent failure wins).
func (s *SearchService) SearchMedia(queries []string) ([]SearchResult, error) {
	if s.BaseURL == "" {
		return nil, fmt.Errorf("searxng base url is not configured")
	}

	type provider struct {
		label    string
		category string
		engine   string
		expand   func(string) []string
		keep     func(SearchResult) bool
	}
	providers := []provider{
		{label: "Envato", category: "general", engine: s.WebEngine, expand: buildEnvatoQueries, keep: isRenderableEnvatoResult},
		{label: "Artgrid", category: "general", engine: s.WebEngine, expand: buildArtgridQueries, keep: isRenderableArtgridResult},
		{label: "Google Video", category: "videos", engine: s.GoogleVideoEngine, expand: buildGoogleVideoQueries, keep: isUsefulGoogleVideoResult},
	}

	var (
		collected = make([]SearchResult, 0, 90)
		visited   = make(map[string]bool)
		failure   error
	)

	for _, term := range limitQueries(queries, 5) {
		term = strings.TrimSpace(term)
		if term == "" {
			continue
		}
		for _, p := range providers {
			for _, q := range p.expand(term) {
				hits, err := s.search(q, p.category, p.engine, p.label)
				if err != nil {
					// Remember the failure, then retry without pinning an engine.
					failure = err
					if hits, err = s.search(q, p.category, "", p.label); err != nil {
						failure = err
						continue
					}
				}
				for _, hit := range hits {
					if hit.Link != "" && !visited[hit.Link] && p.keep(hit) {
						visited[hit.Link] = true
						collected = append(collected, hit)
					}
				}
			}
		}
	}

	// Surface an error only when nothing at all could be collected.
	if failure != nil && len(collected) == 0 {
		return nil, failure
	}

	byWeight := func(a, b int) bool {
		return sourceWeight(collected[a].Source) > sourceWeight(collected[b].Source)
	}
	sort.SliceStable(collected, byWeight)
	return s.EnrichResults(collected), nil
}
// EnrichResults returns a copy of results in which the first 24 entries (or
// fewer, if the slice is shorter) have been passed through enrichResult.
//
// Enrichment runs concurrently with at most four lookups in flight; the call
// blocks until all of them have finished. An empty input is returned as-is.
func (s *SearchService) EnrichResults(results []SearchResult) []SearchResult {
	count := len(results)
	if count > 24 {
		count = 24
	}
	if count == 0 {
		return results
	}

	// Work on a private copy so the caller's slice is never mutated.
	out := make([]SearchResult, len(results))
	copy(out, results)

	slots := make(chan struct{}, 4)
	var pending sync.WaitGroup
	pending.Add(count)
	for i := range out[:count] {
		go func(pos int) {
			defer pending.Done()
			// Take a slot before doing network work; release it afterwards.
			slots <- struct{}{}
			defer func() { <-slots }()
			out[pos] = s.enrichResult(out[pos])
		}(i)
	}
	pending.Wait()
	return out
}
// enrichResult dispatches a result to the enrichment routine for its source.
// Envato and Artgrid results get provider-specific lookups; every other
// source only receives a derived thumbnail when it has none.
func (s *SearchService) enrichResult(result SearchResult) SearchResult {
	if result.Source == "Envato" {
		return s.enrichEnvato(result)
	}
	if result.Source == "Artgrid" {
		return s.enrichArtgrid(result)
	}
	if result.ThumbnailURL != "" {
		return result
	}
	result.ThumbnailURL = deriveThumbnail(result.Link)
	return result
}
// enrichEnvato downloads the result page and fills in a missing thumbnail
// (from the og:image or twitter:image meta tags) and a missing preview video
// URL. Fields that are already set are left alone; if the page cannot be
// fetched the result is returned unchanged.
func (s *SearchService) enrichEnvato(result SearchResult) SearchResult {
	page, fetchErr := s.fetchText(result.Link)
	if fetchErr != nil {
		return result
	}
	if result.ThumbnailURL == "" {
		ogImage := extractMetaContent(page, "og:image")
		twitterImage := extractMetaContent(page, "twitter:image")
		result.ThumbnailURL = firstNonEmpty(ogImage, twitterImage)
	}
	if result.PreviewVideoURL == "" {
		result.PreviewVideoURL = extractVideoPreviewURL(page)
	}
	return result
}
// enrichArtgrid fills in a missing thumbnail and preview video for an Artgrid
// clip. It first asks the clip-details endpoint on artgrid.io and picks media
// URLs out of the response body; whatever is still missing is then looked up
// in the clip page's HTML. Results whose link carries no clip ID are returned
// unchanged, and fetch failures are silently skipped.
func (s *SearchService) enrichArtgrid(result SearchResult) SearchResult {
	id := extractArtgridClipID(result.Link)
	if id == "" {
		return result
	}

	if payload, apiErr := s.fetchJSONText("https://artgrid.io/api/clip/details?clipId=" + id); apiErr == nil {
		found := collectURLs(payload)
		if result.ThumbnailURL == "" {
			result.ThumbnailURL = pickImageURL(found)
		}
		if result.PreviewVideoURL == "" {
			result.PreviewVideoURL = pickVideoURL(found)
		}
	}

	// Nothing left to look up: skip the HTML fallback entirely.
	if result.ThumbnailURL != "" && result.PreviewVideoURL != "" {
		return result
	}

	page, pageErr := s.fetchText(result.Link)
	if pageErr != nil {
		return result
	}
	if result.ThumbnailURL == "" {
		ogImage := extractMetaContent(page, "og:image")
		twitterImage := extractMetaContent(page, "twitter:image")
		result.ThumbnailURL = firstNonEmpty(ogImage, twitterImage)
	}
	if result.PreviewVideoURL == "" {
		result.PreviewVideoURL = extractVideoPreviewURL(page)
	}
	return result
}
// search issues one JSON query against the "/search" endpoint under BaseURL
// and converts the response into SearchResult values.
//
// categories and engine are only sent when non-empty. source is forwarded to
// normalizeSource to label each result. Entries with a blank URL are dropped.
// An error is returned when the request fails, the status code is 300 or
// above, or the body cannot be decoded as JSON.
func (s *SearchService) search(query, categories, engine, source string) ([]SearchResult, error) {
	params := url.Values{
		"q":          {query},
		"format":     {"json"},
		"safesearch": {"0"},
		"language":   {"en-US"},
	}
	if categories != "" {
		params.Set("categories", categories)
	}
	if engine != "" {
		params.Set("engines", engine)
	}

	resp, err := s.Client.Get(s.BaseURL + "/search?" + params.Encode())
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()
	if resp.StatusCode >= 300 {
		return nil, fmt.Errorf("searxng returned status %d for query %q", resp.StatusCode, query)
	}

	// rawResult mirrors the subset of fields read from each response entry.
	type rawResult struct {
		Title        string `json:"title"`
		URL          string `json:"url"`
		Content      string `json:"content"`
		Thumbnail    string `json:"thumbnail"`
		ThumbnailSrc string `json:"thumbnail_src"`
		ImgSrc       string `json:"img_src"`
		ParsedURL    []any  `json:"parsed_url"`
		Engine       string `json:"engine"`
	}
	var payload struct {
		Results []rawResult `json:"results"`
	}
	if err := json.NewDecoder(resp.Body).Decode(&payload); err != nil {
		return nil, fmt.Errorf("searxng JSON decode failed for query %q: %w", query, err)
	}

	converted := make([]SearchResult, 0, len(payload.Results))
	for _, raw := range payload.Results {
		link := strings.TrimSpace(raw.URL)
		if link == "" {
			continue
		}
		// Prefer a thumbnail supplied by the backend; derive one as a last resort.
		thumb := firstNonEmpty(raw.Thumbnail, raw.ThumbnailSrc, raw.ImgSrc, deriveThumbnail(link))
		converted = append(converted, SearchResult{
			Title:        raw.Title,
			Link:         link,
			DisplayLink:  inferDisplayLink(link, raw.ParsedURL),
			Snippet:      raw.Content,
			ThumbnailURL: thumb,
			Source:       normalizeSource(source, link, raw.Engine),
		})
	}
	return converted, nil
}
// buildGoogleVideoQueries expands base into two video-search queries. Both
// quote base as an exact phrase, require a footage-related keyword and exclude
// instructional or commentary content; the first is the broader of the two.
func buildGoogleVideoQueries(base string) []string {
	const (
		broadFilter  = `("stock footage" OR "b-roll" OR cinematic OR "establishing shot" OR editorial) -tutorial -"how to" -review -reaction -course -podcast -vlog -interview -breakdown -edit -editing`
		narrowFilter = `("cinematic footage" OR "free stock footage" OR "4k footage") -tutorial -"how to" -review`
	)
	phrase := fmt.Sprintf(`"%s"`, base)
	return []string{
		phrase + " " + broadFilter,
		phrase + " " + narrowFilter,
	}
}
// buildEnvatoQueries expands base into one site-restricted query per Envato
// property (elements.envato.com and videohive.net/item). base is quoted as an
// exact phrase and combined with stock-footage keywords.
func buildEnvatoQueries(base string) []string {
	const keywords = `("stock footage" OR "stock video" OR "b-roll" OR cinematic)`
	sites := []string{"elements.envato.com", "videohive.net/item"}

	built := make([]string, 0, len(sites))
	for _, site := range sites {
		built = append(built, fmt.Sprintf(`"%s" %s site:%s`, base, keywords, site))
	}
	return built
}
// buildArtgridQueries expands base into two queries restricted to
// artgrid.io/clip/ pages. base is quoted as an exact phrase and paired with
// two different keyword groups.
func buildArtgridQueries(base string) []string {
	keywordGroups := []string{
		`("stock footage" OR "b-roll" OR cinematic OR editorial)`,
		`("footage" OR "cinematic" OR "establishing shot")`,
	}

	built := make([]string, 0, len(keywordGroups))
	for _, group := range keywordGroups {
		built = append(built, fmt.Sprintf(`"%s" %s site:artgrid.io/clip/`, base, group))
	}
	return built
}
// bannedVideoTermRE matches any banned term that begins at a word boundary in
// lower-cased text. Only the start is anchored so that inflected forms such as
// "tutorials" or "learning" are still caught, while words that merely contain
// a term ("preview" contains "review", "clearness" contains "learn") are not.
// "edit tutorial" and "editing tutorial" need no entries of their own because
// "tutorial" already covers them.
var bannedVideoTermRE = regexp.MustCompile(
	`\b(?:tutorial|how to|review|reaction|podcast|interview|walkthrough|course|lesson|` +
		`premiere pro|after effects|breakdown|explained|vlog|tips|guide|learn|free download)`,
)

// isUsefulGoogleVideoResult reports whether a video result looks like actual
// footage rather than instructional or commentary content. It returns false
// when the title or snippet contains a banned term (case-insensitively) at the
// start of a word.
func isUsefulGoogleVideoResult(result SearchResult) bool {
	text := strings.ToLower(result.Title + " " + result.Snippet)
	return !bannedVideoTermRE.MatchString(text)
}
// envatoItemIDSuffixRE matches the trailing "-<ID>" of an Envato Elements item
// slug, where the ID is at least six upper-case letters or digits.
var envatoItemIDSuffixRE = regexp.MustCompile(`-[A-Z0-9]{6,}$`)

// isRenderableEnvatoResult reports whether the result links to an individual
// item page on videohive.net or elements.envato.com.
//
// The host must be exactly one of those domains or a subdomain of it; a host
// that merely contains the domain text (e.g. "videohive.net.evil.example") is
// rejected, which matters because accepted links are later fetched by the
// service. VideoHive links must live under "/item/<something>". Envato
// Elements links must be a single path segment ending in an item ID.
func isRenderableEnvatoResult(result SearchResult) bool {
	parsed, err := url.Parse(result.Link)
	if err != nil {
		return false
	}
	// Hostname drops any port so the comparison is against the bare domain.
	host := strings.ToLower(parsed.Hostname())
	path := strings.Trim(parsed.Path, "/")

	switch {
	case host == "videohive.net" || strings.HasSuffix(host, ".videohive.net"):
		// The path is already trimmed, so an "item/" prefix implies that a
		// non-empty segment follows it.
		return strings.HasPrefix(path, "item/")
	case host == "elements.envato.com" || strings.HasSuffix(host, ".elements.envato.com"):
		return path != "" && !strings.Contains(path, "/") && envatoItemIDSuffixRE.MatchString(path)
	default:
		return false
	}
}
// artgridClipPathRE matches paths of the form "/clip/<digits>/...".
var artgridClipPathRE = regexp.MustCompile(`^/clip/[0-9]+/`)

// isRenderableArtgridResult reports whether the result links to an individual
// clip page on artgrid.io.
//
// The host must be artgrid.io or one of its subdomains; hosts that merely
// contain that text are rejected, which matters because accepted links are
// later fetched by the service. The path must start with "/clip/<numeric id>/".
func isRenderableArtgridResult(result SearchResult) bool {
	parsed, err := url.Parse(result.Link)
	if err != nil {
		return false
	}
	// Hostname drops any port so the comparison is against the bare domain.
	host := strings.ToLower(parsed.Hostname())
	if host != "artgrid.io" && !strings.HasSuffix(host, ".artgrid.io") {
		return false
	}
	return artgridClipPathRE.MatchString(parsed.Path)
}
// normalizeSource chooses the provider label for a result. An explicit source
// always wins. Otherwise the label is inferred from the link ("envato" or
// "videohive" yields "Envato", "artgrid" yields "Artgrid"), then from the
// engine name (anything containing "google" yields "Google Video"). When
// nothing matches, the engine name is returned unchanged. All matching is
// case-insensitive.
func normalizeSource(source, link, engine string) string {
	if source != "" {
		return source
	}

	lowerLink := strings.ToLower(link)
	if strings.Contains(lowerLink, "envato") || strings.Contains(lowerLink, "videohive") {
		return "Envato"
	}
	if strings.Contains(lowerLink, "artgrid") {
		return "Artgrid"
	}
	if strings.Contains(strings.ToLower(engine), "google") {
		return "Google Video"
	}
	return engine
}
// inferDisplayLink returns the host to display for a result. When parsed has
// at least two elements and the second one is a string, that string is used
// (presumably the host component of the backend's pre-parsed URL; verify
// against the search backend's response format). Otherwise the host is parsed
// out of link, and an empty string is returned when link cannot be parsed.
func inferDisplayLink(link string, parsed []any) string {
	if len(parsed) >= 2 {
		if netloc, isString := parsed[1].(string); isString {
			return netloc
		}
	}
	u, err := url.Parse(link)
	if err != nil {
		return ""
	}
	return u.Host
}
// deriveThumbnail builds an i.ytimg.com "hqdefault" thumbnail URL from the
// YouTube video ID found in link. It returns an empty string when link carries
// no recognizable video ID.
func deriveThumbnail(link string) string {
	id := extractYouTubeID(link)
	if id == "" {
		return ""
	}
	return "https://i.ytimg.com/vi/" + id + "/hqdefault.jpg"
}
// youTubeIDPatterns lists the URL shapes an 11-character video ID is read
// from, in priority order: "v=" query parameters and "/shorts/" or "/embed/"
// paths first, then "youtu.be/" short links. They are compiled once at package
// initialization instead of on every call.
var youTubeIDPatterns = []*regexp.Regexp{
	regexp.MustCompile(`(?:v=|/shorts/|/embed/)([A-Za-z0-9_-]{11})`),
	regexp.MustCompile(`youtu\.be/([A-Za-z0-9_-]{11})`),
}

// extractYouTubeID returns the 11-character video ID embedded in link, or an
// empty string when none of the known URL shapes match.
func extractYouTubeID(link string) string {
	for _, pattern := range youTubeIDPatterns {
		if matches := pattern.FindStringSubmatch(link); len(matches) == 2 {
			return matches[1]
		}
	}
	return ""
}
// extractMetaContent returns the HTML-unescaped content attribute of the first
// <meta> tag in html whose property (or, failing that, name) attribute equals
// property. Matching is case-insensitive and accepts single or double quotes.
//
// Both attribute orders are recognized: the identifying attribute may appear
// before or after content, since either order is valid HTML. Tags identified
// by "property" take precedence over tags identified by "name". An empty
// string is returned when no tag matches.
func extractMetaContent(html, property string) string {
	quoted := regexp.QuoteMeta(property)
	for _, attr := range []string{"property", "name"} {
		key := attr + `=["']` + quoted + `["']`
		// The expressions depend on the requested property, so they are built
		// per call; QuoteMeta guarantees they always compile.
		expressions := []string{
			`(?i)<meta[^>]+` + key + `[^>]+content=["']([^"']+)`,
			`(?i)<meta[^>]+content=["']([^"']+)["'][^>]*` + key,
		}
		for _, expr := range expressions {
			matches := regexp.MustCompile(expr).FindStringSubmatch(html)
			if len(matches) == 2 {
				return htmlUnescape(matches[1])
			}
		}
	}
	return ""
}
// videoPreviewURLRE matches http(s) URLs ending in "mp4" or "m3u8". The
// slashes after the scheme may be backslash-escaped (as in JSON embedded in a
// page). The URL body stops at quotes, whitespace or '>'; backslashes are
// allowed inside it so that escaped slashes in the path still match.
var videoPreviewURLRE = regexp.MustCompile(`https?:\\?/\\?/[^"'\s>]+(?:mp4|m3u8)`)

// extractVideoPreviewURL scans html for video URLs and returns the first one
// whose unescaped form contains "preview" or "video" (case-insensitively).
// Escaped slashes ("\/" and "\u002F") are converted to "/" and doubled
// backslashes are removed before the check. An empty string is returned when
// no candidate qualifies.
func extractVideoPreviewURL(html string) string {
	for _, match := range videoPreviewURLRE.FindAllString(html, -1) {
		candidate := strings.ReplaceAll(match, `\/`, `/`)
		candidate = strings.ReplaceAll(candidate, `\u002F`, `/`)
		candidate = strings.ReplaceAll(candidate, `\\`, "")

		lowered := strings.ToLower(candidate)
		if strings.Contains(lowered, "preview") || strings.Contains(lowered, "video") {
			return candidate
		}
	}
	return ""
}
// artgridClipIDRE captures the numeric ID from a "/clip/<digits>/" path
// segment. It is compiled once at package initialization instead of on every
// call.
var artgridClipIDRE = regexp.MustCompile(`/clip/([0-9]+)/`)

// extractArtgridClipID returns the numeric clip ID found in link, or an empty
// string when link contains no "/clip/<digits>/" segment.
func extractArtgridClipID(link string) string {
	if matches := artgridClipIDRE.FindStringSubmatch(link); len(matches) == 2 {
		return matches[1]
	}
	return ""
}
// bodyURLRE matches http(s) URLs up to the next quote, backslash or
// whitespace character. It is compiled once at package initialization instead
// of on every call.
var bodyURLRE = regexp.MustCompile(`https?://[^"'\\\s]+`)

// collectURLs returns every distinct http(s) URL found in body, in order of
// first appearance. The result is never nil.
func collectURLs(body string) []string {
	matches := bodyURLRE.FindAllString(body, -1)
	seen := make(map[string]bool, len(matches))
	results := make([]string, 0, len(matches))
	for _, match := range matches {
		// Trimming also removes Unicode whitespace that the pattern's \s
		// (ASCII only) lets through at the edges of a match.
		candidate := strings.TrimSpace(strings.Trim(match, `"'`))
		if candidate == "" || seen[candidate] {
			continue
		}
		seen[candidate] = true
		results = append(results, candidate)
	}
	return results
}
// pickImageURL returns the first URL that contains a known image extension
// (.jpg, .jpeg, .png or .webp) anywhere in it, compared case-insensitively.
// It returns an empty string when no URL qualifies.
func pickImageURL(urls []string) string {
	extensions := [...]string{".jpg", ".jpeg", ".png", ".webp"}
	for _, candidate := range urls {
		folded := strings.ToLower(candidate)
		for _, ext := range extensions {
			if strings.Contains(folded, ext) {
				return candidate
			}
		}
	}
	return ""
}
// pickVideoURL returns the first URL that contains ".mp4" or ".m3u8" anywhere
// in it, compared case-insensitively. It returns an empty string when no URL
// qualifies.
func pickVideoURL(urls []string) string {
	extensions := [...]string{".mp4", ".m3u8"}
	for _, candidate := range urls {
		folded := strings.ToLower(candidate)
		for _, ext := range extensions {
			if strings.Contains(folded, ext) {
				return candidate
			}
		}
	}
	return ""
}
// fetchText GETs target with the service's HTTP client and returns up to the
// first 1 MiB of the response body as a string. It fails when the request
// errors, the status code is 300 or above, or the body cannot be read.
func (s *SearchService) fetchText(target string) (string, error) {
	const maxBody = 1024 * 1024

	resp, err := s.Client.Get(target)
	if err != nil {
		return "", err
	}
	defer resp.Body.Close()

	if code := resp.StatusCode; code >= 300 {
		return "", fmt.Errorf("fetch returned status %d", code)
	}

	raw, readErr := io.ReadAll(io.LimitReader(resp.Body, maxBody))
	if readErr != nil {
		return "", readErr
	}
	return string(raw), nil
}
// fetchJSONText GETs target while advertising a JSON Accept header and returns
// up to the first 1 MiB of the response body as a string. The body is not
// parsed. It fails when the request cannot be built or sent, the status code
// is 300 or above, or the body cannot be read.
func (s *SearchService) fetchJSONText(target string) (string, error) {
	const maxBody = 1024 * 1024

	request, err := http.NewRequest(http.MethodGet, target, nil)
	if err != nil {
		return "", err
	}
	request.Header.Set("Accept", "application/json, text/json")

	response, err := s.Client.Do(request)
	if err != nil {
		return "", err
	}
	defer response.Body.Close()

	if code := response.StatusCode; code >= 300 {
		return "", fmt.Errorf("json fetch returned status %d", code)
	}

	raw, readErr := io.ReadAll(io.LimitReader(response.Body, maxBody))
	if readErr != nil {
		return "", readErr
	}
	return string(raw), nil
}
// firstNonEmpty returns the first argument that is not blank (empty or
// whitespace only). The value is returned untrimmed. It returns an empty
// string when every argument is blank or none are given.
func firstNonEmpty(values ...string) string {
	for i := range values {
		if len(strings.TrimSpace(values[i])) > 0 {
			return values[i]
		}
	}
	return ""
}
// limitQueries returns up to limit distinct, non-blank queries in their
// original order. Each query is trimmed of surrounding whitespace, and
// duplicates are detected case-insensitively (the first spelling is kept).
//
// A limit of zero or less yields an empty slice. The result is never nil.
func limitQueries(queries []string, limit int) []string {
	if limit <= 0 {
		// Without this guard a zero limit would still let one query through
		// and a negative limit would panic when sizing the slice.
		return []string{}
	}

	capacity := len(queries)
	if limit < capacity {
		capacity = limit
	}

	seen := make(map[string]bool, capacity)
	filtered := make([]string, 0, capacity)
	for _, item := range queries {
		if len(filtered) >= limit {
			break
		}
		trimmed := strings.TrimSpace(item)
		if trimmed == "" {
			continue
		}
		key := strings.ToLower(trimmed)
		if seen[key] {
			continue
		}
		seen[key] = true
		filtered = append(filtered, trimmed)
	}
	return filtered
}
// htmlUnescape decodes the five entities &amp;, &quot;, &#39;, &lt; and &gt;
// in text in a single pass. Any other entity is left untouched.
func htmlUnescape(text string) string {
	entityPairs := []string{
		"&amp;", "&",
		"&quot;", `"`,
		"&#39;", "'",
		"&lt;", "<",
		"&gt;", ">",
	}
	return strings.NewReplacer(entityPairs...).Replace(text)
}
// sourceWeight returns the ranking weight of a source label: "Envato" is 3,
// "Artgrid" is 2, "Google Video" is 1 and anything else is 0. Higher weights
// sort first.
func sourceWeight(source string) int {
	// Listed from lowest to highest weight; a label's weight is its position
	// plus one.
	ranked := [...]string{"Google Video", "Artgrid", "Envato"}
	for position, label := range ranked {
		if label == source {
			return position + 1
		}
	}
	return 0
}
// minInt returns the smaller of a and b.
func minInt(a, b int) int {
	smaller := b
	if a < b {
		smaller = a
	}
	return smaller
}