From e4262613c392f032e2b8707bbc37ab5f5c784e89 Mon Sep 17 00:00:00 2001 From: AI Assistant Date: Fri, 13 Mar 2026 19:31:57 +0900 Subject: [PATCH] Fix Artgrid collector matching and split ranker --- TODO.md | 12 +++ backend/handlers/api.go | 162 +---------------------------------- backend/services/cse.go | 66 ++++++++++++-- backend/services/cse_test.go | 14 +++ backend/services/ranker.go | 159 ++++++++++++++++++++++++++++++++++ scripts/selftest.sh | 2 +- 6 files changed, 250 insertions(+), 165 deletions(-) create mode 100644 backend/services/ranker.go diff --git a/TODO.md b/TODO.md index ed909b1..5c5186a 100644 --- a/TODO.md +++ b/TODO.md @@ -105,6 +105,18 @@ - Current implementation note: - collectors are still in Go code under backend services, but the responsibilities are now separated by source instead of one monolithic search loop +## Current Session Update (2026-03-13, Artgrid Collector Fix + Ranker Split) +- Artgrid collector regression fixed: + - real search results can come back as `artlist.io/stock-footage/clip/.../<id>` instead of only `artgrid.io/clip/<id>/...` + - renderable filtering was rejecting those URLs, which caused `SearXNG returned no renderable results.` for Artgrid-only searches +- Fix applied: + - Artgrid renderability now accepts both `artgrid.io` and `artlist.io/stock-footage/clip/...` clip URLs + - Artgrid result links are normalized into `https://artgrid.io/clip/<id>/<slug>` inside the collector flow before filtering/enrichment +- Refactor continued: + - ranking / Gemini candidate evaluation / recommendation merge logic moved out of `handlers/api.go` + - new service layer file: `backend/services/ranker.go` + - handler is now thinner and less coupled to search internals + ## Local Self-Test Workflow - Primary command: - `bash scripts/selftest.sh` diff --git a/backend/handlers/api.go b/backend/handlers/api.go index d25b574..af5b5b5 100644 --- a/backend/handlers/api.go +++ b/backend/handlers/api.go @@ -11,7 +11,6 @@ import ( "os/exec" "path/filepath" "regexp" 
- "sort" "strings" "sync" "time" @@ -87,16 +86,6 @@ type searchDebugSummary struct { GeminiCandidateCap int `json:"geminiCandidateCap,omitempty"` } -type geminiBatchStats struct { - CandidateCap int `json:"candidateCap"` - Requested int `json:"requested"` - Batches int `json:"batches"` - Succeeded int `json:"succeeded"` - Failed int `json:"failed"` - RecommendedCount int `json:"recommendedCount"` - Errors []string `json:"errors,omitempty"` -} - func RegisterRoutes(router *gin.Engine, app *App) { router.GET("/healthz", func(c *gin.Context) { c.JSON(http.StatusOK, gin.H{"status": "ok"}) @@ -329,10 +318,10 @@ func (a *App) searchMedia(c *gin.Context) { if len(queryVariants) > 0 { rankQuery = strings.Join(queryVariants[:min(len(queryVariants), 3)], " ") } - scored := rankSearchResults(rankQuery, results) - a.debug("search ranked summary", summarizeSearchResults(scored, time.Since(started), geminiCandidateLimit(len(scored)), "")) + scored := services.RankSearchResults(rankQuery, results) + a.debug("search ranked summary", summarizeSearchResults(scored, time.Since(started), services.GeminiCandidateLimit(len(scored)), "")) a.Hub.Broadcast("progress", gin.H{"type": "search", "status": "analyzing top candidate visuals with Gemini Vision", "progress": 75}) - recommended, geminiStats := evaluateAllCandidatesWithGemini(a.GeminiService, req.Query, scored) + recommended, geminiStats := services.EvaluateAllCandidatesWithGemini(a.GeminiService, req.Query, scored) a.debug("search gemini evaluation", geminiStats) err = nil if len(recommended) == 0 { @@ -359,7 +348,7 @@ func (a *App) searchMedia(c *gin.Context) { return } - merged := mergeRecommendations(recommended, scored, 20) + merged := services.MergeRecommendations(recommended, scored, 20) a.debug("search complete summary", summarizeRecommendationResults(merged, time.Since(started), "")) response := gin.H{"results": merged, "queries": queryVariants} a.Hub.Broadcast("progress", gin.H{"type": "search", "status": "search complete", 
"progress": 100}) @@ -438,149 +427,6 @@ func selectedPlatformLabel(platforms map[string]bool) string { return strings.Join(labels, ", ") } -func evaluateAllCandidatesWithGemini(service *services.GeminiService, query string, ranked []services.SearchResult) ([]services.AIRecommendation, geminiBatchStats) { - const chunkSize = 8 - limit := geminiCandidateLimit(len(ranked)) - stats := geminiBatchStats{ - CandidateCap: limit, - Requested: min(limit, len(ranked)), - } - merged := make([]services.AIRecommendation, 0, len(ranked)) - seen := map[string]bool{} - for start := 0; start < limit; start += chunkSize { - end := start + chunkSize - if end > limit { - end = limit - } - batch := ranked[start:end] - stats.Batches++ - recommended, err := service.Recommend(query, batch) - if err != nil { - stats.Failed++ - if len(stats.Errors) < 5 { - stats.Errors = append(stats.Errors, err.Error()) - } - continue - } - stats.Succeeded++ - for _, item := range recommended { - if item.Link == "" || seen[item.Link] { - continue - } - seen[item.Link] = true - merged = append(merged, item) - } - } - stats.RecommendedCount = len(merged) - return merged, stats -} - -func rankSearchResults(query string, results []services.SearchResult) []services.SearchResult { - queryTerms := strings.Fields(strings.ToLower(query)) - positiveTerms := []string{ - "b-roll", "b roll", "stock", "stock footage", "footage", "cinematic", "editorial", - "establishing", "4k", "hd", "drone", "ambient", "scene", "urban", "cityscape", - } - negativeTerms := []string{ - "shocking", "amazing", "crazy", "must watch", "reaction", "gossip", "celebrity", - "thumbnail", "meme", "prank", "drama", "breaking", "viral", "tutorial", - "how to", "review", "walkthrough", "course", "lesson", "podcast", "interview", - "premiere pro", "after effects", "explained", "breakdown", "vlog", - } - type scoredResult struct { - item services.SearchResult - score int - } - - scored := make([]scoredResult, 0, len(results)) - for _, result := range 
results { - score := 0 - text := strings.ToLower(result.Title + " " + result.Snippet + " " + result.Source) - for _, term := range queryTerms { - if strings.Contains(text, term) { - score += 3 - } - } - for _, term := range positiveTerms { - if strings.Contains(text, term) { - score += 2 - } - } - for _, term := range negativeTerms { - if strings.Contains(text, term) { - score -= 4 - } - } - if result.ThumbnailURL != "" { - score += 2 - } - if result.PreviewVideoURL != "" { - score += 3 - } - switch result.Source { - case "Google Video": - score -= 1 - case "Envato": - score += 7 - case "Artgrid": - score += 7 - } - scored = append(scored, scoredResult{item: result, score: score}) - } - - sort.SliceStable(scored, func(i, j int) bool { - return scored[i].score > scored[j].score - }) - - ranked := make([]services.SearchResult, 0, len(scored)) - for _, item := range scored { - ranked = append(ranked, item.item) - } - return ranked -} - -func mergeRecommendations(recommended []services.AIRecommendation, ranked []services.SearchResult, limit int) []services.AIRecommendation { - merged := make([]services.AIRecommendation, 0, min(limit, len(ranked))) - seen := map[string]bool{} - - for _, item := range recommended { - if item.Link == "" || seen[item.Link] { - continue - } - seen[item.Link] = true - merged = append(merged, item) - } - - for _, item := range ranked { - if len(merged) >= limit || item.Link == "" || seen[item.Link] { - continue - } - seen[item.Link] = true - merged = append(merged, services.AIRecommendation{ - Title: item.Title, - Link: item.Link, - Snippet: item.Snippet, - ThumbnailURL: item.ThumbnailURL, - PreviewVideoURL: item.PreviewVideoURL, - Source: item.Source, - Reason: "Keyword-ranked result added without extra Gemini vision tokens.", - Recommended: true, - }) - } - return merged -} - -func geminiCandidateLimit(total int) int { - switch { - case total <= 8: - return total - case total <= 16: - return 12 - default: - return 16 - } -} - func 
summarizeSearchResults(results []services.SearchResult, duration time.Duration, geminiCap int, warning string) searchDebugSummary { bySource := map[string]int{} withPreview := 0 diff --git a/backend/services/cse.go b/backend/services/cse.go index 3fdf866..67d35e4 100644 --- a/backend/services/cse.go +++ b/backend/services/cse.go @@ -90,6 +90,7 @@ func (s *SearchService) SearchMedia(queries []string, enabledPlatforms map[strin continue } for _, item := range items { + item = normalizeResultForCollector(collector.Name(), item) if item.Link == "" || seen[item.Link] || !collector.Accept(item) { continue } @@ -379,10 +380,16 @@ func isRenderableArtgridResult(result SearchResult) bool { if err != nil { return false } - if !strings.Contains(strings.ToLower(parsed.Host), "artgrid.io") { + host := strings.ToLower(parsed.Host) + switch { + case strings.Contains(host, "artgrid.io"): + return regexp.MustCompile(`^/clip/[0-9]+/`).MatchString(parsed.Path) + case strings.Contains(host, "artlist.io"): + trimmedPath := strings.TrimSuffix(parsed.Path, "/") + return regexp.MustCompile(`^/stock-footage/clip/.+/[0-9]+$`).MatchString(trimmedPath) + default: return false } - return regexp.MustCompile(`^/clip/[0-9]+/`).MatchString(parsed.Path) } func normalizeSource(source, link, engine string) string { @@ -391,7 +398,7 @@ func normalizeSource(source, link, engine string) string { return source case strings.Contains(strings.ToLower(link), "envato") || strings.Contains(strings.ToLower(link), "videohive"): return "Envato" - case strings.Contains(strings.ToLower(link), "artgrid"): + case strings.Contains(strings.ToLower(link), "artgrid"), strings.Contains(strings.ToLower(link), "artlist.io/stock-footage/clip/"): return "Artgrid" case strings.Contains(strings.ToLower(engine), "google"): return "Google Video"
@@ -473,13 +480,76 @@ func extractArtgridBackgroundThumbnail(html, clipID string) string {
 }
 
+// extractArtgridClipID returns the numeric clip ID embedded in an Artgrid
+// (/clip/<id>/...) or Artlist (/stock-footage/clip/<slug>/<id>) clip URL, or ""
+// when link matches neither shape.
 func extractArtgridClipID(link string) string {
-	matches := regexp.MustCompile(`/clip/([0-9]+)/`).FindStringSubmatch(link)
-	if len(matches) == 2 {
-		return matches[1]
+	// One trailing slash is dropped so the end-anchored patterns can match.
+	trimmed := strings.TrimSuffix(link, "/")
+	for _, pattern := range artgridClipIDPatterns {
+		if matches := pattern.FindStringSubmatch(trimmed); len(matches) == 2 {
+			return matches[1]
+		}
 	}
 	return ""
 }
 
+// artgridClipIDPatterns lists the clip URL shapes tried in order by
+// extractArtgridClipID; they are compiled once instead of on every call.
+var artgridClipIDPatterns = []*regexp.Regexp{
+	// Artgrid: the ID may be followed by a slug or end the (slash-trimmed) link.
+	regexp.MustCompile(`/clip/([0-9]+)(?:/|$)`),
+	// Artlist: the ID is the last path segment, after the slug.
+	regexp.MustCompile(`/stock-footage/clip/.+/([0-9]+)$`),
+}
+
+// canonicalizeArtgridLink rewrites an Artlist stock-footage clip URL to
+// https://artgrid.io/clip/<id>/<slug>. Links that already contain
+// "artgrid.io/clip/", have no recognizable clip ID, or fail to parse are
+// returned whitespace-trimmed and otherwise unchanged.
+func canonicalizeArtgridLink(link string) string {
+	trimmed := strings.TrimSpace(link)
+	if trimmed == "" {
+		return ""
+	}
+	clipID := extractArtgridClipID(trimmed)
+	if clipID == "" {
+		return trimmed
+	}
+	if strings.Contains(strings.ToLower(trimmed), "artgrid.io/clip/") {
+		return trimmed
+	}
+	parsed, err := url.Parse(trimmed)
+	if err != nil {
+		return trimmed
+	}
+	segments := strings.Split(strings.Trim(parsed.Path, "/"), "/")
+	// The slug is the path segment just before the clip ID; fall back to the ID itself.
+	slug := clipID
+	for idx, segment := range segments {
+		if segment == clipID && idx > 0 {
+			slug = segments[idx-1]
+			break
+		}
+	}
+	return "https://artgrid.io/clip/" + clipID + "/" + slug
+}
+
+// normalizeResultForCollector stamps result with the collector's source label
+// for the "Artgrid", "Envato" and "Google Video" collectors, and additionally
+// canonicalizes the link for Artgrid. Other sources are returned unchanged.
+func normalizeResultForCollector(source string, result SearchResult) SearchResult {
+	switch source {
+	case "Artgrid":
+		result.Link = canonicalizeArtgridLink(result.Link)
+		result.Source = "Artgrid"
+	case "Envato":
+		result.Source = "Envato"
+	case "Google Video":
+		result.Source = "Google Video"
+	}
+	return result
+}
+
 func collectURLs(body string) []string {
 	pattern := regexp.MustCompile(`https?:\/\/[^"'\\\s]+`)
 	matches := pattern.FindAllString(body, -1)
diff --git a/backend/services/cse_test.go b/backend/services/cse_test.go index 493c85e..185cd77 100644 --- a/backend/services/cse_test.go +++ b/backend/services/cse_test.go @@ -49,3 +49,17 @@ func TestCleanArtgridTitle(t *testing.T) { t.Fatalf("expected %q, got %q", want, got) } } + +func TestCanonicalizeArtgridLinkFromArtlist(t *testing.T) { + got := 
canonicalizeArtgridLink("https://artlist.io/stock-footage/clip/movie-film-moving-slowly-from-a-reel/114756") + want := "https://artgrid.io/clip/114756/movie-film-moving-slowly-from-a-reel" + if got != want { + t.Fatalf("expected %q, got %q", want, got) + } +} + +func TestIsRenderableArtgridResultAcceptsArtlistCanonical(t *testing.T) { + if !isRenderableArtgridResult(SearchResult{Link: "https://artlist.io/stock-footage/clip/movie-film-moving-slowly-from-a-reel/114756"}) { + t.Fatal("expected artlist canonical clip URL to be accepted for Artgrid collector") + } +} diff --git a/backend/services/ranker.go b/backend/services/ranker.go new file mode 100644 index 0000000..ee5de4c --- /dev/null +++ b/backend/services/ranker.go
@@ -0,0 +1,187 @@
+package services
+
+import (
+	"sort"
+	"strings"
+)
+
+// GeminiBatchStats summarizes one EvaluateAllCandidatesWithGemini run.
+type GeminiBatchStats struct {
+	CandidateCap     int      `json:"candidateCap"`
+	Requested        int      `json:"requested"`
+	Batches          int      `json:"batches"`
+	Succeeded        int      `json:"succeeded"`
+	Failed           int      `json:"failed"`
+	RecommendedCount int      `json:"recommendedCount"`
+	Errors           []string `json:"errors,omitempty"`
+}
+
+// RankSearchResults returns results ordered best-first by a keyword score.
+// Query terms and stock-footage vocabulary raise the score; clickbait and
+// tutorial vocabulary lower it. A thumbnail, a preview video and the source
+// also adjust it. Results with equal scores keep their input order.
+func RankSearchResults(query string, results []SearchResult) []SearchResult {
+	queryTerms := strings.Fields(strings.ToLower(query))
+	positiveTerms := []string{
+		"b-roll", "b roll", "stock", "stock footage", "footage", "cinematic", "editorial",
+		"establishing", "4k", "hd", "drone", "ambient", "scene", "urban", "cityscape",
+	}
+	negativeTerms := []string{
+		"shocking", "amazing", "crazy", "must watch", "reaction", "gossip", "celebrity",
+		"thumbnail", "meme", "prank", "drama", "breaking", "viral", "tutorial",
+		"how to", "review", "walkthrough", "course", "lesson", "podcast", "interview",
+		"premiere pro", "after effects", "explained", "breakdown", "vlog",
+	}
+	type scoredResult struct {
+		item  SearchResult
+		score int
+	}
+
+	scored := make([]scoredResult, 0, len(results))
+	for _, result := range results {
+		score := 0
+		text := strings.ToLower(result.Title + " " + result.Snippet + " " + result.Source)
+		for _, term := range queryTerms {
+			if strings.Contains(text, term) {
+				score += 3
+			}
+		}
+		for _, term := range positiveTerms {
+			if strings.Contains(text, term) {
+				score += 2
+			}
+		}
+		for _, term := range negativeTerms {
+			if strings.Contains(text, term) {
+				score -= 4
+			}
+		}
+		if result.ThumbnailURL != "" {
+			score += 2
+		}
+		if result.PreviewVideoURL != "" {
+			score += 3
+		}
+		switch result.Source {
+		case "Google Video":
+			score -= 1
+		case "Envato":
+			score += 7
+		case "Artgrid":
+			score += 7
+		}
+		scored = append(scored, scoredResult{item: result, score: score})
+	}
+
+	sort.SliceStable(scored, func(i, j int) bool {
+		return scored[i].score > scored[j].score
+	})
+
+	ranked := make([]SearchResult, 0, len(scored))
+	for _, item := range scored {
+		ranked = append(ranked, item.item)
+	}
+	return ranked
+}
+
+// GeminiCandidateLimit returns the cap on how many ranked results are sent to
+// Gemini for a result set of size total. The cap may exceed total (9 to 11
+// results still yield 12), so callers must clamp it before slicing.
+func GeminiCandidateLimit(total int) int {
+	switch {
+	case total <= 8:
+		return total
+	case total <= 16:
+		return 12
+	default:
+		return 16
+	}
+}
+
+// EvaluateAllCandidatesWithGemini sends the top-ranked results to
+// service.Recommend in batches of eight and returns the recommendations with
+// duplicate and empty links removed, plus per-batch statistics. A failed
+// batch is counted and skipped; it does not abort the remaining batches.
+func EvaluateAllCandidatesWithGemini(service *GeminiService, query string, ranked []SearchResult) ([]AIRecommendation, GeminiBatchStats) {
+	const chunkSize = 8
+	limit := GeminiCandidateLimit(len(ranked))
+	// The cap can be larger than the result count (9 to 11 results cap at 12), so
+	// batches are cut against the clamped count to keep ranked[start:end] in range.
+	requested := min(limit, len(ranked))
+	stats := GeminiBatchStats{
+		CandidateCap: limit,
+		Requested:    requested,
+	}
+	merged := make([]AIRecommendation, 0, requested)
+	seen := map[string]bool{}
+	for start := 0; start < requested; start += chunkSize {
+		end := min(start+chunkSize, requested)
+		batch := ranked[start:end]
+		stats.Batches++
+		recommended, err := service.Recommend(query, batch)
+		if err != nil {
+			stats.Failed++
+			// At most five error messages are recorded.
+			if len(stats.Errors) < 5 {
+				stats.Errors = append(stats.Errors, err.Error())
+			}
+			continue
+		}
+		stats.Succeeded++
+		for _, item := range recommended {
+			if item.Link == "" || seen[item.Link] {
+				continue
+			}
+			seen[item.Link] = true
+			merged = append(merged, item)
+		}
+	}
+	stats.RecommendedCount = len(merged)
+	return merged, stats
+}
+
+// MergeRecommendations returns at most limit entries: the recommended items
+// first, then ranked results not already present, each wrapped as an
+// AIRecommendation with a fixed keyword-ranking reason. Entries with an empty
+// or already-seen Link are skipped. A non-positive limit yields an empty,
+// non-nil slice.
+func MergeRecommendations(recommended []AIRecommendation, ranked []SearchResult, limit int) []AIRecommendation {
+	if limit <= 0 {
+		return []AIRecommendation{}
+	}
+	merged := make([]AIRecommendation, 0, min(limit, len(recommended)+len(ranked)))
+	seen := map[string]bool{}
+
+	for _, item := range recommended {
+		if len(merged) >= limit {
+			break
+		}
+		if item.Link == "" || seen[item.Link] {
+			continue
+		}
+		seen[item.Link] = true
+		merged = append(merged, item)
+	}
+
+	for _, item := range ranked {
+		// Stop as soon as the list is full instead of scanning the remaining results.
+		if len(merged) >= limit {
+			break
+		}
+		if item.Link == "" || seen[item.Link] {
+			continue
+		}
+		seen[item.Link] = true
+		merged = append(merged, AIRecommendation{
+			Title:           item.Title,
+			Link:            item.Link,
+			Snippet:         item.Snippet,
+			ThumbnailURL:    item.ThumbnailURL,
+			PreviewVideoURL: item.PreviewVideoURL,
+			Source:          item.Source,
+			Reason:          "Keyword-ranked result added without extra Gemini vision tokens.",
+			Recommended:     true,
+		})
+	}
+	return merged
+}
diff --git a/scripts/selftest.sh b/scripts/selftest.sh index 89c4848..b16b08d 100755 --- a/scripts/selftest.sh +++ b/scripts/selftest.sh @@ -24,7 +24,7 @@ trap cleanup EXIT cd "${ROOT_DIR}" echo "[selftest] gofmt" -gofmt -w backend/main.go backend/handlers/api.go backend/models/db.go backend/services/cse.go backend/services/cse_test.go backend/services/search_collectors.go backend/services/gemini.go backend/services/gemini_test.go +gofmt -w backend/main.go backend/handlers/api.go backend/models/db.go backend/services/cse.go backend/services/cse_test.go backend/services/search_collectors.go backend/services/ranker.go backend/services/gemini.go backend/services/gemini_test.go echo "[selftest] python syntax" python3 -m py_compile worker/downloader.py scripts/mock_searxng.py