From 9a33ecc6b501103c44645b5a4e61833c0373d2a8 Mon Sep 17 00:00:00 2001 From: GHStaK Date: Wed, 18 Mar 2026 13:00:41 +0900 Subject: [PATCH] Revert "Reduce gemini partial batch noise" This reverts commit 3be797131a8283a4c12491f66c7908456ce0c8b3. --- TODO.md | 16 ------- backend/services/gemini.go | 76 ++++++++++----------------------- backend/services/gemini_test.go | 9 ---- backend/services/ranker.go | 8 ++++ 4 files changed, 30 insertions(+), 79 deletions(-) diff --git a/TODO.md b/TODO.md index 37303df..a2fa523 100644 --- a/TODO.md +++ b/TODO.md @@ -655,22 +655,6 @@ - If behavior in the browser does not match the latest backend/frontend code, the first assumption should be stale frontend assets until proven otherwise ## Recent Change Log -- Date: `2026-03-17` -- What changed: - - Added adaptive Gemini Vision output-token sizing so smaller candidate batches, especially single-candidate sequential recovery calls, now request much shorter responses. - - Added a dedicated shorter single-candidate Gemini Vision instruction path for sequential recovery after batch failure. - - Stopped counting a batch as a strong user-facing partial failure when sequential recovery still salvages recommendations from that batch. - - Added unit coverage for the adaptive Gemini Vision token budget helper. -- Why it changed: - - The user-provided log `ai-media-hub-2026-03-17T07-55-17-127Z.log` still showed `gemini vision partially failed on 4 of 6 batches`. - - The same log also showed `sequentialRetried: 0`, which means the fallback single-candidate reevaluation path was still not recovering those truncated JSON batches well enough. -- How it was verified: - - `pwsh -NoProfile -File scripts/selftest.ps1` - - added Go tests for adaptive Gemini token sizing -- What is still risky or incomplete: - - This reduces partial-failure pressure further, but extremely short or malformed Gemini outputs can still fail before one complete recommendation object is emitted. - - Smaller recovery responses improve reliability, but repeated sequential recovery can still add latency on difficult searches. - - Date: `2026-03-17` - What changed: - Reduced Gemini Vision batch size from `6` to `4` so each model response carries fewer recommendation objects and is less likely to be truncated mid-JSON. diff --git a/backend/services/gemini.go b/backend/services/gemini.go index 44110bc..389fc08 100644 --- a/backend/services/gemini.go +++ b/backend/services/gemini.go @@ -253,7 +253,24 @@ func (g *GeminiService) Recommend(query string, candidates []SearchResult) ([]AI type geminiPart map[string]any parts := []geminiPart{ { - "text": buildGeminiVisionInstruction(query, len(candidates)), + "text": `You are a professional video editor. Analyze whether each provided visual is suitable as a usable scene or shot for the user's requested keyword. Return JSON only in this shape: +{"recommendations":[{"index":0,"verdict":"Yes","reason":"short reason","recommended":true,"assessment":"positive","searchHint":"short english hint"}]} +Return one entry for every analyzed candidate. Use Korean for every reason. Keep reasons concise but specific enough to explain usefulness. +Keep each Korean reason very short, ideally one sentence under 24 Korean characters when possible. +Set verdict to "Yes" or "No" for every candidate. "Yes" means the scene is usable and relevant for editing against the user's keyword. "No" means it is not suitable or not relevant enough. +Set recommended=true only when verdict is "Yes". Set recommended=false when verdict is "No". +Set assessment to one of: positive, unclear, irrelevant, inappropriate. +- positive: directly usable and relevant to the query +- unclear: visually ambiguous, weak, or not confident enough +- irrelevant: visibly unrelated to the query intent +- inappropriate: low-quality, spammy, misleading, meme-like, or otherwise unsuitable for professional editing +When assessment is not positive, provide searchHint as a short English stock-footage search phrase that could help find better candidates. Keep it under 8 words. +When assessment is positive, searchHint may be empty. +Do not include markdown fences, explanations, or comments. Output compact JSON only. +Prefer cinematic b-roll, stock footage, editorial footage, clean composition, usable establishing shots, and professional media thumbnails. +Avoid clickbait faces, exaggerated expressions, meme aesthetics, low-information thumbnails, sensational text overlays, or gossip-style imagery. +Favor scenes that look directly useful for professional editing, sequencing, establishing, cutaway, or mood-building usage. +User query: ` + query, }, } @@ -280,10 +297,9 @@ func (g *GeminiService) Recommend(query string, candidates []SearchResult) ([]AI return nil, fmt.Errorf("no candidate thumbnails or preview frames could be fetched for gemini vision") } g.debug("gemini:vision_visuals_prepared", map[string]any{ - "query": query, - "visualCount": visualCount, - "maxImages": maxImages, - "maxOutputTokens": geminiVisionMaxOutputTokens(visualCount), + "query": query, + "visualCount": visualCount, + "maxImages": maxImages, }) body := map[string]any{ @@ -293,7 +309,7 @@ func (g *GeminiService) Recommend(query string, candidates []SearchResult) ([]AI "generationConfig": map[string]any{ "responseMimeType": "application/json", "temperature": 0.1, - "maxOutputTokens": geminiVisionMaxOutputTokens(visualCount), + "maxOutputTokens": 900, }, } @@ -368,54 +384,6 @@ func (g *GeminiService) Recommend(query string, candidates []SearchResult) ([]AI return recommendations, nil } -func buildGeminiVisionInstruction(query string, candidateCount int) string { - if candidateCount <= 1 { - return `You are a professional video editor. Analyze the single provided visual for the user's keyword. -Return compact JSON only in this exact shape: -{"recommendations":[{"index":0,"verdict":"Yes","reason":"짧은 한국어 문장","recommended":true,"assessment":"positive","searchHint":""}]} -Return exactly one item. -Use a very short Korean reason. -Use verdict "Yes" or "No". -Set assessment to one of: positive, unclear, irrelevant, inappropriate. -Keep searchHint empty unless the visual is weak or irrelevant, then use a very short English stock-footage phrase. -No markdown fences. No commentary. Compact JSON only. -User query: ` + query - } - return `You are a professional video editor. Analyze whether each provided visual is suitable as a usable scene or shot for the user's requested keyword. Return JSON only in this shape: -{"recommendations":[{"index":0,"verdict":"Yes","reason":"short reason","recommended":true,"assessment":"positive","searchHint":"short english hint"}]} -Return one entry for every analyzed candidate. Use Korean for every reason. Keep reasons concise but specific enough to explain usefulness. -Keep each Korean reason very short, ideally one sentence under 24 Korean characters when possible. -Set verdict to "Yes" or "No" for every candidate. "Yes" means the scene is usable and relevant for editing against the user's keyword. "No" means it is not suitable or not relevant enough. -Set recommended=true only when verdict is "Yes". Set recommended=false when verdict is "No". -Set assessment to one of: positive, unclear, irrelevant, inappropriate. -- positive: directly usable and relevant to the query -- unclear: visually ambiguous, weak, or not confident enough -- irrelevant: visibly unrelated to the query intent -- inappropriate: low-quality, spammy, misleading, meme-like, or otherwise unsuitable for professional editing -When assessment is not positive, provide searchHint as a short English stock-footage search phrase that could help find better candidates. Keep it under 8 words. -When assessment is positive, searchHint may be empty. -Do not include markdown fences, explanations, or comments. Output compact JSON only. -Prefer cinematic b-roll, stock footage, editorial footage, clean composition, usable establishing shots, and professional media thumbnails. -Avoid clickbait faces, exaggerated expressions, meme aesthetics, low-information thumbnails, sensational text overlays, or gossip-style imagery. -Favor scenes that look directly useful for professional editing, sequencing, establishing, cutaway, or mood-building usage. -User query: ` + query -} - -func geminiVisionMaxOutputTokens(candidateCount int) int { - switch { - case candidateCount <= 1: - return 180 - case candidateCount == 2: - return 260 - case candidateCount == 3: - return 340 - case candidateCount == 4: - return 420 - default: - return 520 - } -} - type geminiVisionParsedPayload struct { Recommendations []struct { Index int `json:"index"` diff --git a/backend/services/gemini_test.go b/backend/services/gemini_test.go index f97770d..439e31a 100644 --- a/backend/services/gemini_test.go +++ b/backend/services/gemini_test.go @@ -256,12 +256,3 @@ func TestExtractCompleteRecommendationObjectsReturnsNilWhenArrayMissing(t *testi t.Fatalf("expected no objects, got %#v", got) } } - -func TestGeminiVisionMaxOutputTokensShrinksForSingleCandidate(t *testing.T) { - if got := geminiVisionMaxOutputTokens(1); got != 180 { - t.Fatalf("expected 180 tokens for single candidate, got %d", got) - } - if got := geminiVisionMaxOutputTokens(4); got != 420 { - t.Fatalf("expected 420 tokens for four candidates, got %d", got) - } -} diff --git a/backend/services/ranker.go b/backend/services/ranker.go index ad796f4..981bb51 100644 --- a/backend/services/ranker.go +++ b/backend/services/ranker.go @@ -198,6 +198,14 @@ func EvaluateAllCandidatesWithGeminiWithDeadline(service *GeminiService, query s seen[item.Link] = true merged = append(merged, item) } + if len(hardErrs) > 0 { + stats.Failed++ + for _, recoveredErr := range hardErrs { + if len(stats.Errors) < 5 { + stats.Errors = append(stats.Errors, recoveredErr) + } + } + } continue } if len(hardErrs) == 0 {