Fix Artgrid collector matching and split ranker
build-push / docker (push) Successful in 4m16s

This commit is contained in:
AI Assistant
2026-03-13 19:31:57 +09:00
parent 5aebbef639
commit e4262613c3
6 changed files with 250 additions and 165 deletions
+12
View File
@@ -105,6 +105,18 @@
- Current implementation note:
- collectors are still in Go code under backend services, but the responsibilities are now separated by source instead of one monolithic search loop
## Current Session Update (2026-03-13, Artgrid Collector Fix + Ranker Split)
- Artgrid collector regression fixed:
- real search results can come back as `artlist.io/stock-footage/clip/.../<id>` instead of only `artgrid.io/clip/<id>/...`
- renderable filtering was rejecting those URLs, which caused `SearXNG returned no renderable results.` for Artgrid-only searches
- Fix applied:
- Artgrid renderability now accepts both `artgrid.io` and `artlist.io/stock-footage/clip/...` clip URLs
- Artgrid result links are normalized into `https://artgrid.io/clip/<id>/<slug>` inside the collector flow before filtering/enrichment
- Refactor continued:
- ranking / Gemini candidate evaluation / recommendation merge logic moved out of `handlers/api.go`
- new service layer file: `backend/services/ranker.go`
- handler is now thinner and less coupled to search internals
## Local Self-Test Workflow
- Primary command:
- `bash scripts/selftest.sh`
+4 -158
View File
@@ -11,7 +11,6 @@ import (
"os/exec"
"path/filepath"
"regexp"
"sort"
"strings"
"sync"
"time"
@@ -87,16 +86,6 @@ type searchDebugSummary struct {
GeminiCandidateCap int `json:"geminiCandidateCap,omitempty"`
}
// geminiBatchStats records what happened during one call to
// evaluateAllCandidatesWithGemini. searchMedia passes it to the debug logger.
type geminiBatchStats struct {
// CandidateCap is the value geminiCandidateLimit returned for the ranked list.
CandidateCap int `json:"candidateCap"`
// Requested is the smaller of CandidateCap and the number of ranked results.
Requested int `json:"requested"`
// Batches counts every chunk submitted to Gemini, successful or not.
Batches int `json:"batches"`
// Succeeded counts chunks for which Recommend returned no error.
Succeeded int `json:"succeeded"`
// Failed counts chunks for which Recommend returned an error.
Failed int `json:"failed"`
// RecommendedCount is the number of recommendations left after de-duplication by link.
RecommendedCount int `json:"recommendedCount"`
// Errors holds the messages of the first failures only (at most five are kept).
Errors []string `json:"errors,omitempty"`
}
func RegisterRoutes(router *gin.Engine, app *App) {
router.GET("/healthz", func(c *gin.Context) {
c.JSON(http.StatusOK, gin.H{"status": "ok"})
@@ -329,10 +318,10 @@ func (a *App) searchMedia(c *gin.Context) {
if len(queryVariants) > 0 {
rankQuery = strings.Join(queryVariants[:min(len(queryVariants), 3)], " ")
}
scored := rankSearchResults(rankQuery, results)
a.debug("search ranked summary", summarizeSearchResults(scored, time.Since(started), geminiCandidateLimit(len(scored)), ""))
scored := services.RankSearchResults(rankQuery, results)
a.debug("search ranked summary", summarizeSearchResults(scored, time.Since(started), services.GeminiCandidateLimit(len(scored)), ""))
a.Hub.Broadcast("progress", gin.H{"type": "search", "status": "analyzing top candidate visuals with Gemini Vision", "progress": 75})
recommended, geminiStats := evaluateAllCandidatesWithGemini(a.GeminiService, req.Query, scored)
recommended, geminiStats := services.EvaluateAllCandidatesWithGemini(a.GeminiService, req.Query, scored)
a.debug("search gemini evaluation", geminiStats)
err = nil
if len(recommended) == 0 {
@@ -359,7 +348,7 @@ func (a *App) searchMedia(c *gin.Context) {
return
}
merged := mergeRecommendations(recommended, scored, 20)
merged := services.MergeRecommendations(recommended, scored, 20)
a.debug("search complete summary", summarizeRecommendationResults(merged, time.Since(started), ""))
response := gin.H{"results": merged, "queries": queryVariants}
a.Hub.Broadcast("progress", gin.H{"type": "search", "status": "search complete", "progress": 100})
@@ -438,149 +427,6 @@ func selectedPlatformLabel(platforms map[string]bool) string {
return strings.Join(labels, ", ")
}
// evaluateAllCandidatesWithGemini submits the top-ranked results to Gemini in
// chunks of eight and returns the recommendations, de-duplicated by link, along
// with statistics about the batches.
//
// A chunk whose Recommend call fails is counted in stats.Failed and skipped;
// only the first five error messages are retained. Recommendations with an
// empty link are dropped.
func evaluateAllCandidatesWithGemini(service *services.GeminiService, query string, ranked []services.SearchResult) ([]services.AIRecommendation, geminiBatchStats) {
	const chunkSize = 8
	const maxRecordedErrors = 5

	limit := geminiCandidateLimit(len(ranked))
	// The cap is not guaranteed to be <= len(ranked). Bound the loop by the
	// number of results that actually exist: slicing ranked past its end
	// panics with "slice bounds out of range".
	requested := min(limit, len(ranked))
	stats := geminiBatchStats{
		CandidateCap: limit,
		Requested:    requested,
	}

	merged := make([]services.AIRecommendation, 0, len(ranked))
	seen := map[string]bool{}
	for start := 0; start < requested; start += chunkSize {
		end := min(start+chunkSize, requested)
		stats.Batches++

		recommended, err := service.Recommend(query, ranked[start:end])
		if err != nil {
			stats.Failed++
			if len(stats.Errors) < maxRecordedErrors {
				stats.Errors = append(stats.Errors, err.Error())
			}
			continue
		}
		stats.Succeeded++

		for _, item := range recommended {
			if item.Link == "" || seen[item.Link] {
				continue
			}
			seen[item.Link] = true
			merged = append(merged, item)
		}
	}

	stats.RecommendedCount = len(merged)
	return merged, stats
}
// rankSearchResults orders results by a keyword heuristic, best first.
//
// Each result is scored against the lowercased concatenation of its title,
// snippet and source:
//   - +3 for every whitespace-separated query term it contains
//   - +2 for every stock-footage style term, -4 for every clickbait/tutorial term
//   - +2 when a thumbnail URL is present, +3 when a preview video URL is present
//   - +7 for the "Envato" and "Artgrid" sources, -1 for "Google Video"
//
// The sort is stable, so results with equal scores keep their input order. The
// input slice is not modified; a new slice is returned.
func rankSearchResults(query string, results []services.SearchResult) []services.SearchResult {
	boostTerms := []string{
		"b-roll", "b roll", "stock", "stock footage", "footage", "cinematic", "editorial",
		"establishing", "4k", "hd", "drone", "ambient", "scene", "urban", "cityscape",
	}
	penaltyTerms := []string{
		"shocking", "amazing", "crazy", "must watch", "reaction", "gossip", "celebrity",
		"thumbnail", "meme", "prank", "drama", "breaking", "viral", "tutorial",
		"how to", "review", "walkthrough", "course", "lesson", "podcast", "interview",
		"premiere pro", "after effects", "explained", "breakdown", "vlog",
	}
	needles := strings.Fields(strings.ToLower(query))

	// hits counts how many of terms occur as substrings of haystack.
	hits := func(haystack string, terms []string) int {
		count := 0
		for _, term := range terms {
			if strings.Contains(haystack, term) {
				count++
			}
		}
		return count
	}

	type entry struct {
		result services.SearchResult
		points int
	}
	entries := make([]entry, len(results))
	for i, candidate := range results {
		haystack := strings.ToLower(candidate.Title + " " + candidate.Snippet + " " + candidate.Source)
		points := 3*hits(haystack, needles) + 2*hits(haystack, boostTerms) - 4*hits(haystack, penaltyTerms)
		if candidate.ThumbnailURL != "" {
			points += 2
		}
		if candidate.PreviewVideoURL != "" {
			points += 3
		}
		switch candidate.Source {
		case "Envato", "Artgrid":
			points += 7
		case "Google Video":
			points--
		}
		entries[i] = entry{result: candidate, points: points}
	}

	sort.SliceStable(entries, func(a, b int) bool {
		return entries[a].points > entries[b].points
	})

	ordered := make([]services.SearchResult, len(entries))
	for i := range entries {
		ordered[i] = entries[i].result
	}
	return ordered
}
// mergeRecommendations returns the Gemini recommendations followed by
// keyword-ranked results that Gemini did not return, de-duplicated by link.
//
// Every recommendation with a non-empty, unseen link is kept regardless of
// limit; limit only stops the keyword-ranked fill once the merged list has
// reached that length. Filled entries are marked Recommended with a fixed
// explanatory Reason.
func mergeRecommendations(recommended []services.AIRecommendation, ranked []services.SearchResult, limit int) []services.AIRecommendation {
	out := make([]services.AIRecommendation, 0, min(limit, len(ranked)))
	taken := make(map[string]bool)

	for _, rec := range recommended {
		if rec.Link != "" && !taken[rec.Link] {
			taken[rec.Link] = true
			out = append(out, rec)
		}
	}

	for _, result := range ranked {
		// Once the list is full nothing further can be appended.
		if len(out) >= limit {
			break
		}
		if result.Link == "" || taken[result.Link] {
			continue
		}
		taken[result.Link] = true
		out = append(out, services.AIRecommendation{
			Title:           result.Title,
			Link:            result.Link,
			Snippet:         result.Snippet,
			ThumbnailURL:    result.ThumbnailURL,
			PreviewVideoURL: result.PreviewVideoURL,
			Source:          result.Source,
			Reason:          "Keyword-ranked result added without extra Gemini vision tokens.",
			Recommended:     true,
		})
	}
	return out
}
// geminiCandidateLimit returns how many of total ranked results should be sent
// to Gemini: everything up to 12 results, 12 for 13-16 results, and 16 beyond
// that.
//
// The returned cap never exceeds total, so it is always a valid upper bound
// for slicing the ranked list.
func geminiCandidateLimit(total int) int {
	const (
		midTierCap = 12
		maxCap     = 16
	)
	switch {
	case total <= midTierCap:
		return total
	case total <= maxCap:
		return midTierCap
	default:
		return maxCap
	}
}
func summarizeSearchResults(results []services.SearchResult, duration time.Duration, geminiCap int, warning string) searchDebugSummary {
bySource := map[string]int{}
withPreview := 0
+60 -6
View File
@@ -90,6 +90,7 @@ func (s *SearchService) SearchMedia(queries []string, enabledPlatforms map[strin
continue
}
for _, item := range items {
item = normalizeResultForCollector(collector.Name(), item)
if item.Link == "" || seen[item.Link] || !collector.Accept(item) {
continue
}
@@ -379,10 +380,16 @@ func isRenderableArtgridResult(result SearchResult) bool {
if err != nil {
return false
}
if !strings.Contains(strings.ToLower(parsed.Host), "artgrid.io") {
host := strings.ToLower(parsed.Host)
switch {
case strings.Contains(host, "artgrid.io"):
return regexp.MustCompile(`^/clip/[0-9]+/`).MatchString(parsed.Path)
case strings.Contains(host, "artlist.io"):
trimmedPath := strings.TrimSuffix(parsed.Path, "/")
return regexp.MustCompile(`^/stock-footage/clip/.+/[0-9]+$`).MatchString(trimmedPath)
default:
return false
}
return regexp.MustCompile(`^/clip/[0-9]+/`).MatchString(parsed.Path)
}
func normalizeSource(source, link, engine string) string {
@@ -391,7 +398,7 @@ func normalizeSource(source, link, engine string) string {
return source
case strings.Contains(strings.ToLower(link), "envato") || strings.Contains(strings.ToLower(link), "videohive"):
return "Envato"
case strings.Contains(strings.ToLower(link), "artgrid"):
case strings.Contains(strings.ToLower(link), "artgrid"), strings.Contains(strings.ToLower(link), "artlist.io/stock-footage/clip/"):
return "Artgrid"
case strings.Contains(strings.ToLower(engine), "google"):
return "Google Video"
@@ -473,13 +480,60 @@ func extractArtgridBackgroundThumbnail(html, clipID string) string {
}
// Clip-ID patterns are compiled once at package scope instead of on every call.
var (
	// artlistClipIDPattern matches /stock-footage/clip/<slug>/<id>, where the
	// numeric id is the last path segment. A trailing slash, query string or
	// fragment after the id is tolerated.
	artlistClipIDPattern = regexp.MustCompile(`/stock-footage/clip/[^?#]+/([0-9]+)/?(?:[?#]|$)`)
	// artgridClipIDPattern matches the native /clip/<id>/... form.
	artgridClipIDPattern = regexp.MustCompile(`/clip/([0-9]+)/`)
)

// extractArtgridClipID returns the numeric clip ID embedded in an Artgrid
// (/clip/<id>/<slug>) or Artlist (/stock-footage/clip/<slug>/<id>) link, or ""
// when the link matches neither form.
//
// The Artlist form is tried first so that an all-digit slug is not mistaken
// for the ID.
func extractArtgridClipID(link string) string {
	for _, pattern := range []*regexp.Regexp{artlistClipIDPattern, artgridClipIDPattern} {
		if matches := pattern.FindStringSubmatch(link); len(matches) == 2 {
			return matches[1]
		}
	}
	return ""
}
// canonicalizeArtgridLink rewrites a clip link into the
// https://artgrid.io/clip/<id>/<slug> form.
//
// The whitespace-trimmed link is returned unchanged when no clip ID can be
// extracted from it, when it already contains "artgrid.io/clip/"
// (case-insensitively), or when it cannot be parsed as a URL. The slug is the
// path segment immediately before the segment equal to the clip ID; if no such
// segment exists the clip ID itself is used as the slug.
func canonicalizeArtgridLink(link string) string {
	candidate := strings.TrimSpace(link)
	if candidate == "" {
		return ""
	}

	id := extractArtgridClipID(candidate)
	alreadyCanonical := strings.Contains(strings.ToLower(candidate), "artgrid.io/clip/")
	if id == "" || alreadyCanonical {
		return candidate
	}

	u, err := url.Parse(candidate)
	if err != nil {
		return candidate
	}

	parts := strings.Split(strings.Trim(u.Path, "/"), "/")
	slug := id
	// Start at 1: the ID segment needs a predecessor to supply the slug.
	for i := 1; i < len(parts); i++ {
		if parts[i] == id {
			slug = parts[i-1]
			break
		}
	}
	return "https://artgrid.io/clip/" + id + "/" + slug
}
// normalizeResultForCollector stamps a result with the name of the collector
// that produced it. For the "Artgrid" collector the link is additionally
// passed through canonicalizeArtgridLink. Results from any collector other
// than "Artgrid", "Envato" or "Google Video" are returned untouched.
func normalizeResultForCollector(source string, result SearchResult) SearchResult {
	if source == "Artgrid" {
		result.Link = canonicalizeArtgridLink(result.Link)
	}
	switch source {
	case "Artgrid", "Envato", "Google Video":
		result.Source = source
	}
	return result
}
func collectURLs(body string) []string {
pattern := regexp.MustCompile(`https?:\/\/[^"'\\\s]+`)
matches := pattern.FindAllString(body, -1)
+14
View File
@@ -49,3 +49,17 @@ func TestCleanArtgridTitle(t *testing.T) {
t.Fatalf("expected %q, got %q", want, got)
}
}
// TestCanonicalizeArtgridLinkFromArtlist checks that an Artlist stock-footage
// clip URL is rewritten into the artgrid.io/clip/<id>/<slug> form.
func TestCanonicalizeArtgridLinkFromArtlist(t *testing.T) {
	const (
		artlistURL = "https://artlist.io/stock-footage/clip/movie-film-moving-slowly-from-a-reel/114756"
		expected   = "https://artgrid.io/clip/114756/movie-film-moving-slowly-from-a-reel"
	)
	if actual := canonicalizeArtgridLink(artlistURL); actual != expected {
		t.Fatalf("expected %q, got %q", expected, actual)
	}
}
func TestIsRenderableArtgridResultAcceptsArtlistCanonical(t *testing.T) {
if !isRenderableArtgridResult(SearchResult{Link: "https://artlist.io/stock-footage/clip/movie-film-moving-slowly-from-a-reel/114756"}) {
t.Fatal("expected artlist canonical clip URL to be accepted for Artgrid collector")
}
}
+159
View File
@@ -0,0 +1,159 @@
package services
import (
"sort"
"strings"
)
// GeminiBatchStats records what happened during one call to
// EvaluateAllCandidatesWithGemini.
type GeminiBatchStats struct {
// CandidateCap is the value GeminiCandidateLimit returned for the ranked list.
CandidateCap int `json:"candidateCap"`
// Requested is the smaller of CandidateCap and the number of ranked results.
Requested int `json:"requested"`
// Batches counts every chunk submitted to Gemini, successful or not.
Batches int `json:"batches"`
// Succeeded counts chunks for which Recommend returned no error.
Succeeded int `json:"succeeded"`
// Failed counts chunks for which Recommend returned an error.
Failed int `json:"failed"`
// RecommendedCount is the number of recommendations left after de-duplication by link.
RecommendedCount int `json:"recommendedCount"`
// Errors holds the messages of the first failures only (at most five are kept).
Errors []string `json:"errors,omitempty"`
}
// RankSearchResults orders results by a keyword heuristic, best first.
//
// Each result is scored against the lowercased concatenation of its title,
// snippet and source:
//   - +3 for every whitespace-separated query term it contains
//   - +2 for every stock-footage style term, -4 for every clickbait/tutorial term
//   - +2 when a thumbnail URL is present, +3 when a preview video URL is present
//   - +7 for the "Envato" and "Artgrid" sources, -1 for "Google Video"
//
// The sort is stable, so results with equal scores keep their input order. The
// input slice is not modified; a new slice is returned.
func RankSearchResults(query string, results []SearchResult) []SearchResult {
	boostTerms := []string{
		"b-roll", "b roll", "stock", "stock footage", "footage", "cinematic", "editorial",
		"establishing", "4k", "hd", "drone", "ambient", "scene", "urban", "cityscape",
	}
	penaltyTerms := []string{
		"shocking", "amazing", "crazy", "must watch", "reaction", "gossip", "celebrity",
		"thumbnail", "meme", "prank", "drama", "breaking", "viral", "tutorial",
		"how to", "review", "walkthrough", "course", "lesson", "podcast", "interview",
		"premiere pro", "after effects", "explained", "breakdown", "vlog",
	}
	needles := strings.Fields(strings.ToLower(query))

	// hits counts how many of terms occur as substrings of haystack.
	hits := func(haystack string, terms []string) int {
		count := 0
		for _, term := range terms {
			if strings.Contains(haystack, term) {
				count++
			}
		}
		return count
	}

	type entry struct {
		result SearchResult
		points int
	}
	entries := make([]entry, len(results))
	for i, candidate := range results {
		haystack := strings.ToLower(candidate.Title + " " + candidate.Snippet + " " + candidate.Source)
		points := 3*hits(haystack, needles) + 2*hits(haystack, boostTerms) - 4*hits(haystack, penaltyTerms)
		if candidate.ThumbnailURL != "" {
			points += 2
		}
		if candidate.PreviewVideoURL != "" {
			points += 3
		}
		switch candidate.Source {
		case "Envato", "Artgrid":
			points += 7
		case "Google Video":
			points--
		}
		entries[i] = entry{result: candidate, points: points}
	}

	sort.SliceStable(entries, func(a, b int) bool {
		return entries[a].points > entries[b].points
	})

	ordered := make([]SearchResult, len(entries))
	for i := range entries {
		ordered[i] = entries[i].result
	}
	return ordered
}
// GeminiCandidateLimit returns how many of total ranked results should be sent
// to Gemini: everything up to 12 results, 12 for 13-16 results, and 16 beyond
// that.
//
// The returned cap never exceeds total, so it is always a valid upper bound
// for slicing the ranked list.
func GeminiCandidateLimit(total int) int {
	const (
		midTierCap = 12
		maxCap     = 16
	)
	switch {
	case total <= midTierCap:
		return total
	case total <= maxCap:
		return midTierCap
	default:
		return maxCap
	}
}
// EvaluateAllCandidatesWithGemini submits the top-ranked results to Gemini in
// chunks of eight and returns the recommendations, de-duplicated by link, along
// with statistics about the batches.
//
// A chunk whose Recommend call fails is counted in stats.Failed and skipped;
// only the first five error messages are retained. Recommendations with an
// empty link are dropped.
func EvaluateAllCandidatesWithGemini(service *GeminiService, query string, ranked []SearchResult) ([]AIRecommendation, GeminiBatchStats) {
	const chunkSize = 8
	const maxRecordedErrors = 5

	limit := GeminiCandidateLimit(len(ranked))
	// The cap is not guaranteed to be <= len(ranked). Bound the loop by the
	// number of results that actually exist: slicing ranked past its end
	// panics with "slice bounds out of range".
	requested := min(limit, len(ranked))
	stats := GeminiBatchStats{
		CandidateCap: limit,
		Requested:    requested,
	}

	merged := make([]AIRecommendation, 0, len(ranked))
	seen := map[string]bool{}
	for start := 0; start < requested; start += chunkSize {
		end := min(start+chunkSize, requested)
		stats.Batches++

		recommended, err := service.Recommend(query, ranked[start:end])
		if err != nil {
			stats.Failed++
			if len(stats.Errors) < maxRecordedErrors {
				stats.Errors = append(stats.Errors, err.Error())
			}
			continue
		}
		stats.Succeeded++

		for _, item := range recommended {
			if item.Link == "" || seen[item.Link] {
				continue
			}
			seen[item.Link] = true
			merged = append(merged, item)
		}
	}

	stats.RecommendedCount = len(merged)
	return merged, stats
}
// MergeRecommendations returns the Gemini recommendations followed by
// keyword-ranked results that Gemini did not return, de-duplicated by link.
//
// Every recommendation with a non-empty, unseen link is kept regardless of
// limit; limit only stops the keyword-ranked fill once the merged list has
// reached that length. Filled entries are marked Recommended with a fixed
// explanatory Reason.
func MergeRecommendations(recommended []AIRecommendation, ranked []SearchResult, limit int) []AIRecommendation {
	out := make([]AIRecommendation, 0, min(limit, len(ranked)))
	taken := make(map[string]bool)

	for _, rec := range recommended {
		if rec.Link != "" && !taken[rec.Link] {
			taken[rec.Link] = true
			out = append(out, rec)
		}
	}

	for _, result := range ranked {
		// Once the list is full nothing further can be appended.
		if len(out) >= limit {
			break
		}
		if result.Link == "" || taken[result.Link] {
			continue
		}
		taken[result.Link] = true
		out = append(out, AIRecommendation{
			Title:           result.Title,
			Link:            result.Link,
			Snippet:         result.Snippet,
			ThumbnailURL:    result.ThumbnailURL,
			PreviewVideoURL: result.PreviewVideoURL,
			Source:          result.Source,
			Reason:          "Keyword-ranked result added without extra Gemini vision tokens.",
			Recommended:     true,
		})
	}
	return out
}
+1 -1
View File
@@ -24,7 +24,7 @@ trap cleanup EXIT
cd "${ROOT_DIR}"
echo "[selftest] gofmt"
gofmt -w backend/main.go backend/handlers/api.go backend/models/db.go backend/services/cse.go backend/services/cse_test.go backend/services/search_collectors.go backend/services/gemini.go backend/services/gemini_test.go
gofmt -w backend/main.go backend/handlers/api.go backend/models/db.go backend/services/cse.go backend/services/cse_test.go backend/services/search_collectors.go backend/services/ranker.go backend/services/gemini.go backend/services/gemini_test.go
echo "[selftest] python syntax"
python3 -m py_compile worker/downloader.py scripts/mock_searxng.py