From b6a217cab9adb6f8fc91e09d115009bbdab54851 Mon Sep 17 00:00:00 2001 From: GHStaK Date: Tue, 17 Mar 2026 17:23:05 +0900 Subject: [PATCH] Harden single-candidate gemini recovery --- TODO.md | 18 ++++ backend/services/gemini.go | 141 +++++++++++++++++++++++++++++--- backend/services/gemini_test.go | 14 ++++ 3 files changed, 161 insertions(+), 12 deletions(-) diff --git a/TODO.md b/TODO.md index 37303df..a162e73 100644 --- a/TODO.md +++ b/TODO.md @@ -655,6 +655,24 @@ - If behavior in the browser does not match the latest backend/frontend code, the first assumption should be stale frontend assets until proven otherwise ## Recent Change Log +- Date: `2026-03-17` +- What changed: + - Added a dedicated single-candidate Gemini recovery path that no longer asks for JSON and instead parses a tiny plain-text key/value response. + - Kept multi-candidate Gemini Vision on compact JSON, but changed sequential recovery to use the shorter plain-text format automatically through the existing `Recommend(..., []SearchResult{item})` path. + - Added unit coverage for the single-candidate plain-text parser. +- Why it changed: + - The user-provided log `ai-media-hub-2026-03-17T08-20-31-074Z.log` showed even more severe truncation: + - `"{\"recommendations\":[{\"index\":"` + - `"{\"recommendations"` + - `"{\"recommend"` + - The same log showed `sequentialRetried: 0`, which means the old single-candidate recovery path was still too verbose and was not successfully rescuing failed batches. +- How it was verified: + - `pwsh -NoProfile -File scripts/selftest.ps1` + - added Go tests for single-candidate Gemini plain-text parsing +- What is still risky or incomplete: + - If Gemini returns malformed plain text that omits the required `verdict:` line, even the single-candidate recovery path can still fail. + - This improves recovery robustness, but total Gemini latency can still rise when many batch failures fall back to candidate-by-candidate evaluation. 
+ - Date: `2026-03-17` - What changed: - Added adaptive Gemini Vision output-token sizing so smaller candidate batches, especially single-candidate sequential recovery calls, now request much shorter responses. diff --git a/backend/services/gemini.go b/backend/services/gemini.go index 44110bc..136ec23 100644 --- a/backend/services/gemini.go +++ b/backend/services/gemini.go @@ -245,6 +245,9 @@ func (g *GeminiService) Recommend(query string, candidates []SearchResult) ([]AI if len(candidates) == 0 { return []AIRecommendation{}, nil } + if len(candidates) == 1 { + return g.recommendSingleCandidate(query, candidates[0]) + } g.debug("gemini:vision_start", map[string]any{ "query": query, "candidateCount": len(candidates), @@ -368,19 +371,92 @@ func (g *GeminiService) Recommend(query string, candidates []SearchResult) ([]AI return recommendations, nil } -func buildGeminiVisionInstruction(query string, candidateCount int) string { - if candidateCount <= 1 { - return `You are a professional video editor. Analyze the single provided visual for the user's keyword. -Return compact JSON only in this exact shape: -{"recommendations":[{"index":0,"verdict":"Yes","reason":"짧은 한국어 문장","recommended":true,"assessment":"positive","searchHint":""}]} -Return exactly one item. -Use a very short Korean reason. -Use verdict "Yes" or "No". -Set assessment to one of: positive, unclear, irrelevant, inappropriate. -Keep searchHint empty unless the visual is weak or irrelevant, then use a very short English stock-footage phrase. -No markdown fences. No commentary. Compact JSON only. 
-User query: ` + query +func (g *GeminiService) recommendSingleCandidate(query string, candidate SearchResult) ([]AIRecommendation, error) { + g.debug("gemini:vision_start", map[string]any{ + "query": query, + "candidateCount": 1, + "mode": "single_candidate_recovery", + }) + + img, mimeType, err := g.fetchCandidateVisualInlineData(candidate) + if err != nil { + g.debug("gemini:vision_candidate_visual_error", map[string]any{ + "index": 0, + "link": candidate.Link, + "source": candidate.Source, + "error": err.Error(), + }) + return nil, err } + + g.debug("gemini:vision_visuals_prepared", map[string]any{ + "query": query, + "visualCount": 1, + "maxImages": 1, + "maxOutputTokens": 120, + "mode": "single_candidate_recovery", + }) + + body := map[string]any{ + "contents": []map[string]any{ + { + "parts": []map[string]any{ + { + "text": `You are a professional video editor. Analyze the single provided visual for the user's keyword. +Return plain text only with exactly these 5 lines: +verdict: Yes or No +assessment: positive or unclear or irrelevant or inappropriate +recommended: true or false +reason_ko: very short Korean reason +search_hint: short English stock-footage hint or empty +No JSON. No markdown. No extra text. 
+User query: ` + query, + }, + {"text": fmt.Sprintf("Candidate 0: title=%s source=%s link=%s", candidate.Title, candidate.Source, candidate.Link)}, + {"inlineData": map[string]string{"mimeType": mimeType, "data": img}}, + }, + }, + }, + "generationConfig": map[string]any{ + "responseMimeType": "text/plain", + "temperature": 0.1, + "maxOutputTokens": 120, + }, + } + + rawText, err := g.generateText(body) + if err != nil { + return nil, err + } + rec, err := parseSingleCandidateVisionText(rawText) + if err != nil { + return nil, fmt.Errorf("gemini single-candidate parse failed: %w; raw=%q", err, truncateForError(rawText, 200)) + } + + recommended := rec.Recommended || strings.EqualFold(strings.TrimSpace(rec.Verdict), "yes") + assessment := normalizeAssessment(rec.Assessment, recommended) + result := AIRecommendation{ + Title: candidate.Title, + Link: candidate.Link, + Snippet: candidate.Snippet, + ThumbnailURL: candidate.ThumbnailURL, + PreviewVideoURL: candidate.PreviewVideoURL, + Source: candidate.Source, + Reason: normalizeKoreanReason(rec.Reason), + Recommended: recommended, + Assessment: assessment, + SearchHint: normalizeSearchHint(rec.SearchHint), + } + + g.debug("gemini:vision_complete", map[string]any{ + "query": query, + "recommendationCount": 1, + "mode": "single_candidate_recovery", + }) + return []AIRecommendation{result}, nil +} + +func buildGeminiVisionInstruction(query string, _ int) string { return `You are a professional video editor. Analyze whether each provided visual is suitable as a usable scene or shot for the user's requested keyword. Return JSON only in this shape: {"recommendations":[{"index":0,"verdict":"Yes","reason":"short reason","recommended":true,"assessment":"positive","searchHint":"short english hint"}]} Return one entry for every analyzed candidate. Use Korean for every reason. Keep reasons concise but specific enough to explain usefulness. 
// singleCandidateVisionResponse carries the fields read from the plain-text
// key/value reply requested by the single-candidate recovery prompt.
type singleCandidateVisionResponse struct {
	Verdict     string // value of the "verdict" line
	Assessment  string // value of the "assessment" line
	Recommended bool   // true when the "recommended" line says true or yes
	Reason      string // value of the "reason_ko" line
	SearchHint  string // value of the "search_hint" line; "" when absent or a placeholder
}

// parseSingleCandidateVisionText parses a line-oriented "key: value" reply.
//
// Recognised keys (case-insensitive) are verdict, assessment, recommended,
// reason_ko and search_hint. Lines without a colon and lines with unknown keys
// are ignored; when a key repeats, the last occurrence wins. Light markdown
// decoration is tolerated: list markers or emphasis around the key
// ("- verdict:", "**verdict:**"), emphasis/backticks around the value, and
// quotes or trailing punctuation around the verdict, assessment and
// recommended tokens. Only the first colon on a line separates key from value,
// so values may themselves contain colons.
//
// It returns an error, and a zero-value response, when no non-empty verdict
// line is present.
func parseSingleCandidateVisionText(raw string) (singleCandidateVisionResponse, error) {
	var result singleCandidateVisionResponse
	for _, line := range strings.Split(raw, "\n") {
		rawKey, rawValue, found := strings.Cut(line, ":")
		if !found {
			continue
		}
		// Strip list markers and emphasis so "- **Verdict**" matches "verdict".
		key := strings.ToLower(strings.Trim(rawKey, "*`-•> \t\r"))
		value := trimSingleCandidateMarkup(rawValue)
		switch key {
		case "verdict":
			result.Verdict = trimSingleCandidateToken(value)
		case "assessment":
			result.Assessment = trimSingleCandidateToken(value)
		case "recommended":
			flag := trimSingleCandidateToken(value)
			result.Recommended = strings.EqualFold(flag, "true") || strings.EqualFold(flag, "yes")
		case "reason_ko":
			result.Reason = value
		case "search_hint":
			result.SearchHint = cleanSingleCandidateSearchHint(value)
		}
	}
	if result.Verdict == "" {
		return singleCandidateVisionResponse{}, fmt.Errorf("missing verdict line")
	}
	return result, nil
}

// trimSingleCandidateMarkup removes surrounding whitespace (including a
// trailing "\r") and markdown emphasis/backticks from a value.
func trimSingleCandidateMarkup(value string) string {
	return strings.TrimSpace(strings.Trim(strings.TrimSpace(value), "*`"))
}

// trimSingleCandidateToken removes quotes and trailing punctuation from a
// single-word answer such as `"Yes."`.
func trimSingleCandidateToken(value string) string {
	return strings.TrimSpace(strings.Trim(value, "\"'.,!"))
}

// cleanSingleCandidateSearchHint unquotes a hint and maps placeholder words to
// "". The prompt offers "or empty" as an option, so a reply may spell the word
// out instead of leaving the value blank.
func cleanSingleCandidateSearchHint(value string) string {
	hint := strings.TrimSpace(strings.Trim(value, "\"'"))
	switch strings.ToLower(hint) {
	case "empty", "(empty)", "none", "n/a", "-":
		return ""
	}
	return hint
}

// TestParseSingleCandidateVisionTextParsesKeyValueResponse checks that a
// well-formed five-line reply is parsed field by field. In the patch this test
// is added to backend/services/gemini_test.go.
func TestParseSingleCandidateVisionTextParsesKeyValueResponse(t *testing.T) {
	raw := "verdict: Yes\nassessment: positive\nrecommended: true\nreason_ko: 적합한 도시 야경\nsearch_hint: "
	parsed, err := parseSingleCandidateVisionText(raw)
	if err != nil {
		t.Fatalf("expected parse success, got %v", err)
	}
	if parsed.Verdict != "Yes" || parsed.Assessment != "positive" || !parsed.Recommended {
		t.Fatalf("unexpected parsed result: %#v", parsed)
	}
	if parsed.Reason != "적합한 도시 야경" {
		t.Fatalf("unexpected reason: %#v", parsed)
	}
	if parsed.SearchHint != "" {
		t.Fatalf("unexpected search hint: %#v", parsed)
	}
}