Revert "Replace gemini batch JSON protocol"

This reverts commit f5d76fc3ec.
2026-03-18 13:00:40 +09:00
parent f5d76fc3ec
commit acfad750ab
3 changed files with 25 additions and 106 deletions
@@ -655,26 +655,6 @@
 - If behavior in the browser does not match the latest backend/frontend code, the first assumption should be stale frontend assets until proven otherwise
 ## Recent Change Log
 - Date: `2026-03-17`
 - What changed:
  - Switched the primary multi-candidate Gemini Vision response format away from JSON and toward a compact line-based text protocol:
    - `index|verdict|assessment|recommended|reason_ko|search_hint`
  - Kept the older JSON parser only as a fallback path instead of the primary success path.
  - Reduced Gemini Vision output-token budgets again to better match the new shorter line-based format.
  - Added unit coverage for the new pipe-delimited Gemini batch parser.
 - Why it changed:
  - The user-provided log `ai-media-hub-2026-03-17T08-38-47-661Z.log` still showed all Gemini batches failing with JSON output truncated almost immediately:
    - `"{\"recommend"`
    - `"{\"recommendations\":[{\""`
    - `"{\"recommendations\":[{\"index"`
  - At that point the right fix was no longer “more JSON hardening”, but removing JSON as the primary batch transport format so completed lines can be recovered even when the tail of the response is cut off.
 - How it was verified:
  - `pwsh -NoProfile -File scripts/selftest.ps1`
  - Added Go tests for line-based Gemini batch parsing.
 - What is still risky or incomplete:
  - If Gemini returns text that does not follow either the pipe-delimited format or the JSON fallback shape, parsing can still fail.
  - The model prompt is now stricter and shorter, which improves reliability, but it can make reasons more terse than before.
 - Date: `2026-03-17`
 - What changed:
  - Added a dedicated single-candidate Gemini recovery path that no longer asks for JSON and instead parses a tiny plain-text key/value response.
@@ -13,7 +13,6 @@ import (
 	"os/exec"
 	"path/filepath"
 	"sort"
 	"strconv"
 	"strings"
 	"sync"
 	"time"
@@ -295,7 +294,7 @@ func (g *GeminiService) Recommend(query string, candidates []SearchResult) ([]AI
 			{"parts": parts},
 		},
 		"generationConfig": map[string]any{
-			"responseMimeType": "text/plain",
+			"responseMimeType": "application/json",
 			"temperature":      0.1,
 			"maxOutputTokens":  geminiVisionMaxOutputTokens(visualCount),
 		},
@@ -458,35 +457,38 @@ User query: ` + query,
 }
 // buildGeminiVisionInstruction returns the instruction prompt for judging
 // candidate visuals against the user's keyword: a fixed block of reviewer
 // rules followed by a trailing "User query: " label with query appended.
 // The second (int) parameter is ignored.
 func buildGeminiVisionInstruction(query string, _ int) string {
-	return `You are a professional video editor. Analyze whether each provided visual is suitable as a usable scene or shot for the user's requested keyword.
+	return `You are a professional video editor. Analyze whether each provided visual is suitable as a usable scene or shot for the user's requested keyword. Return JSON only in this shape:
-Return plain text only.
+{"recommendations":[{"index":0,"verdict":"Yes","reason":"short reason","recommended":true,"assessment":"positive","searchHint":"short english hint"}]}
-Return exactly one line per analyzed candidate in this exact format:
+Return one entry for every analyzed candidate. Use Korean for every reason. Keep reasons concise but specific enough to explain usefulness.
-index|verdict|assessment|recommended|reason_ko|search_hint
+Keep each Korean reason very short, ideally one sentence under 24 Korean characters when possible.
-Rules:
+Set verdict to "Yes" or "No" for every candidate. "Yes" means the scene is usable and relevant for editing against the user's keyword. "No" means it is not suitable or not relevant enough.
- index: integer candidate index
+Set recommended=true only when verdict is "Yes". Set recommended=false when verdict is "No".
- verdict: Yes or No
+Set assessment to one of: positive, unclear, irrelevant, inappropriate.
- assessment: positive or unclear or irrelevant or inappropriate
+- positive: directly usable and relevant to the query
- recommended: true or false
+- unclear: visually ambiguous, weak, or not confident enough
- reason_ko: very short Korean reason without line breaks and without |
+- irrelevant: visibly unrelated to the query intent
- search_hint: short English stock-footage phrase or empty, without |
+- inappropriate: low-quality, spammy, misleading, meme-like, or otherwise unsuitable for professional editing
-Do not include markdown fences, JSON, bullets, numbering, or any other text.
+When assessment is not positive, provide searchHint as a short English stock-footage search phrase that could help find better candidates. Keep it under 8 words.
 When assessment is positive, searchHint may be empty.
 Do not include markdown fences, explanations, or comments. Output compact JSON only.
 Prefer cinematic b-roll, stock footage, editorial footage, clean composition, usable establishing shots, and professional media thumbnails.
 Avoid clickbait faces, exaggerated expressions, meme aesthetics, low-information thumbnails, sensational text overlays, or gossip-style imagery.
 Favor scenes that look directly useful for professional editing, sequencing, establishing, cutaway, or mood-building usage.
 User query: ` + query
 }
 // geminiVisionMaxOutputTokens returns the output-token budget to use for a
 // request covering candidateCount visuals. The budget grows with the count;
 // every count above four shares the default arm.
 func geminiVisionMaxOutputTokens(candidateCount int) int {
 	switch {
 	case candidateCount <= 1:
 		return 120
 	case candidateCount == 2:
 		return 180
 	// NOTE(review): this repeats the previous case condition, so this arm can
 	// never be selected as written. Presumably an artifact of how the diff was
 	// rendered — confirm against the actual file.
 	case candidateCount == 2:
 		return 260
 	case candidateCount == 3:
-		return 240
+		return 340
 	case candidateCount == 4:
-		return 300
+		return 420
 	default:
-		return 360
+		return 520
 	}
 }
@@ -502,10 +504,6 @@ type geminiVisionParsedPayload struct {
 }
 func parseGeminiVisionRecommendations(raw string) (geminiVisionParsedPayload, bool, error) {
 	if parsed, ok := parseGeminiVisionLines(raw); ok {
 		return parsed, false, nil
 	}
 	jsonText, err := extractJSONObject(raw)
 	if err == nil {
 		var parsed geminiVisionParsedPayload
@@ -550,51 +548,6 @@ func parseGeminiVisionRecommendations(raw string) (geminiVisionParsedPayload, bo
 	return parsed, true, nil
 }
 // parseGeminiVisionLines reads a pipe-delimited batch response in which each
 // row has the form
 //
 //	index|verdict|assessment|recommended|reason|search_hint
 //
 // CRLF line endings are normalized to LF, and backticks at either end of a
 // row are stripped before parsing. A row is skipped when it is blank, when it
 // does not split into exactly six fields, or when its first field is not an
 // integer. Because the split is capped at six fields, any extra "|" ends up
 // inside the search hint. The recommended flag is true when the fourth field
 // equals "true" or "yes", ignoring case.
 //
 // The boolean result reports whether at least one row was parsed.
 func parseGeminiVisionLines(raw string) (geminiVisionParsedPayload, bool) {
 	// Alias for the payload's anonymous element type so it is spelled once.
 	type recommendation = struct {
 		Index       int    `json:"index"`
 		Verdict     string `json:"verdict"`
 		Reason      string `json:"reason"`
 		Recommended bool   `json:"recommended"`
 		Assessment  string `json:"assessment"`
 		SearchHint  string `json:"searchHint"`
 	}

 	normalized := strings.ReplaceAll(strings.TrimSpace(raw), "\r\n", "\n")
 	rows := strings.Split(normalized, "\n")

 	var payload geminiVisionParsedPayload
 	payload.Recommendations = make([]recommendation, 0, len(rows))

 	for _, row := range rows {
 		row = strings.TrimSpace(strings.Trim(row, "`"))
 		if row == "" {
 			continue
 		}

 		fields := strings.SplitN(row, "|", 6)
 		if len(fields) != 6 {
 			continue
 		}
 		for i := range fields {
 			fields[i] = strings.TrimSpace(fields[i])
 		}

 		index, err := strconv.Atoi(fields[0])
 		if err != nil {
 			continue
 		}

 		flag := fields[3]
 		payload.Recommendations = append(payload.Recommendations, recommendation{
 			Index:       index,
 			Verdict:     fields[1],
 			Assessment:  fields[2],
 			Recommended: strings.EqualFold(flag, "true") || strings.EqualFold(flag, "yes"),
 			Reason:      fields[4],
 			SearchHint:  fields[5],
 		})
 	}

 	return payload, len(payload.Recommendations) != 0
 }
 type singleCandidateVisionResponse struct {
 	Verdict     string
 	Assessment  string
@@ -258,11 +258,11 @@ func TestExtractCompleteRecommendationObjectsReturnsNilWhenArrayMissing(t *testi
 }
 // TestGeminiVisionMaxOutputTokensShrinksForSingleCandidate pins the exact
 // token budgets geminiVisionMaxOutputTokens returns for one candidate and for
 // four candidates.
 func TestGeminiVisionMaxOutputTokensShrinksForSingleCandidate(t *testing.T) {
-	if got := geminiVisionMaxOutputTokens(1); got != 120 {
+	if got := geminiVisionMaxOutputTokens(1); got != 180 {
-		t.Fatalf("expected 120 tokens for single candidate, got %d", got)
+		t.Fatalf("expected 180 tokens for single candidate, got %d", got)
 	}
-	if got := geminiVisionMaxOutputTokens(4); got != 300 {
+	if got := geminiVisionMaxOutputTokens(4); got != 420 {
-		t.Fatalf("expected 300 tokens for four candidates, got %d", got)
+		t.Fatalf("expected 420 tokens for four candidates, got %d", got)
 	}
 }
@@ -279,17 +279,3 @@ func TestParseSingleCandidateVisionTextParsesKeyValueResponse(t *testing.T) {
 		t.Fatalf("unexpected reason: %#v", parsed)
 	}
 }
 // TestParseGeminiVisionLinesParsesPipeDelimitedRows feeds two pipe-delimited
 // rows (the first with an empty trailing search hint) to
 // parseGeminiVisionLines and checks that both rows are returned, in order,
 // with indices 0 and 1.
 func TestParseGeminiVisionLinesParsesPipeDelimitedRows(t *testing.T) {
 	const input = "0|Yes|positive|true|적합한 네온 도시|\n1|No|irrelevant|false|관련성 낮음|night city skyline"

 	payload, parsedOK := parseGeminiVisionLines(input)
 	if !parsedOK {
 		t.Fatal("expected pipe-delimited parser to succeed")
 	}

 	recs := payload.Recommendations
 	if got := len(recs); got != 2 {
 		t.Fatalf("unexpected parsed recommendations: %#v", recs)
 	}
 	if first, second := recs[0].Index, recs[1].Index; !(first == 0 && second == 1) {
 		t.Fatalf("unexpected parsed indices: %#v", recs)
 	}
 }