Replace gemini batch JSON protocol

2026-03-17 17:41:33 +09:00
parent b6a217cab9
commit f5d76fc3ec
3 changed files with 106 additions and 25 deletions
@@ -13,6 +13,7 @@ import (
 	"os/exec"
 	"path/filepath"
 	"sort"
+	"strconv"
 	"strings"
 	"sync"
 	"time"
@@ -294,7 +295,7 @@ func (g *GeminiService) Recommend(query string, candidates []SearchResult) ([]AI
 			{"parts": parts},
 		},
 		"generationConfig": map[string]any{
-			"responseMimeType": "application/json",
+			"responseMimeType": "text/plain",
 			"temperature":      0.1,
 			"maxOutputTokens":  geminiVisionMaxOutputTokens(visualCount),
 		},
@@ -457,38 +458,35 @@ User query: ` + query,
 }

 // buildGeminiVisionInstruction returns the prompt text for a vision
 // recommendation request. The prompt asks for plain text only, exactly one
 // line per analyzed candidate in the form
 // index|verdict|assessment|recommended|reason_ko|search_hint, and ends with
 // the caller's query appended after "User query: ". The second (int)
 // parameter is ignored.
 func buildGeminiVisionInstruction(query string, _ int) string {
-	return `You are a professional video editor. Analyze whether each provided visual is suitable as a usable scene or shot for the user's requested keyword. Return JSON only in this shape:
-{"recommendations":[{"index":0,"verdict":"Yes","reason":"short reason","recommended":true,"assessment":"positive","searchHint":"short english hint"}]}
-Return one entry for every analyzed candidate. Use Korean for every reason. Keep reasons concise but specific enough to explain usefulness.
-Keep each Korean reason very short, ideally one sentence under 24 Korean characters when possible.
-Set verdict to "Yes" or "No" for every candidate. "Yes" means the scene is usable and relevant for editing against the user's keyword. "No" means it is not suitable or not relevant enough.
-Set recommended=true only when verdict is "Yes". Set recommended=false when verdict is "No".
-Set assessment to one of: positive, unclear, irrelevant, inappropriate.
- positive: directly usable and relevant to the query
- unclear: visually ambiguous, weak, or not confident enough
- irrelevant: visibly unrelated to the query intent
- inappropriate: low-quality, spammy, misleading, meme-like, or otherwise unsuitable for professional editing
-When assessment is not positive, provide searchHint as a short English stock-footage search phrase that could help find better candidates. Keep it under 8 words.
-When assessment is positive, searchHint may be empty.
-Do not include markdown fences, explanations, or comments. Output compact JSON only.
+	return `You are a professional video editor. Analyze whether each provided visual is suitable as a usable scene or shot for the user's requested keyword.
+Return plain text only.
+Return exactly one line per analyzed candidate in this exact format:
+index|verdict|assessment|recommended|reason_ko|search_hint
+Rules:
+- index: integer candidate index
+- verdict: Yes or No
+- assessment: positive or unclear or irrelevant or inappropriate
+- recommended: true or false
+- reason_ko: very short Korean reason without line breaks and without |
+- search_hint: short English stock-footage phrase or empty, without |
+Do not include markdown fences, JSON, bullets, numbering, or any other text.
 Prefer cinematic b-roll, stock footage, editorial footage, clean composition, usable establishing shots, and professional media thumbnails.
 Avoid clickbait faces, exaggerated expressions, meme aesthetics, low-information thumbnails, sensational text overlays, or gossip-style imagery.
-Favor scenes that look directly useful for professional editing, sequencing, establishing, cutaway, or mood-building usage.
 User query: ` + query
 }

 // geminiVisionMaxOutputTokens returns the value used for the request's
 // "maxOutputTokens" setting, chosen by the number of analyzed candidates.
 // Counts of one or fewer share the smallest budget, counts of two, three and
 // four each get a larger one, and five or more use the default (largest)
 // budget.
 func geminiVisionMaxOutputTokens(candidateCount int) int {
 	switch {
 	case candidateCount <= 1:
-		return 180
+		return 120
 	case candidateCount == 2:
-		return 260
+		return 180
 	case candidateCount == 3:
-		return 340
+		return 240
 	case candidateCount == 4:
-		return 420
+		return 300
 	default:
-		return 520
+		return 360
 	}
 }

@@ -504,6 +502,10 @@ type geminiVisionParsedPayload struct {
 }

 func parseGeminiVisionRecommendations(raw string) (geminiVisionParsedPayload, bool, error) {
+	if parsed, ok := parseGeminiVisionLines(raw); ok {
+		return parsed, false, nil
+	}
+
 	jsonText, err := extractJSONObject(raw)
 	if err == nil {
 		var parsed geminiVisionParsedPayload
@@ -548,6 +550,51 @@ func parseGeminiVisionRecommendations(raw string) (geminiVisionParsedPayload, bo
 	return parsed, true, nil
 }

+func parseGeminiVisionLines(raw string) (geminiVisionParsedPayload, bool) {
+	lines := strings.Split(strings.ReplaceAll(strings.TrimSpace(raw), "\r\n", "\n"), "\n")
+	parsed := geminiVisionParsedPayload{
+		Recommendations: make([]struct {
+			Index       int    `json:"index"`
+			Verdict     string `json:"verdict"`
+			Reason      string `json:"reason"`
+			Recommended bool   `json:"recommended"`
+			Assessment  string `json:"assessment"`
+			SearchHint  string `json:"searchHint"`
+		}, 0, len(lines)),
+	}
+
+	for _, line := range lines {
+		trimmed := strings.TrimSpace(strings.Trim(line, "`"))
+		if trimmed == "" {
+			continue
+		}
+		parts := strings.SplitN(trimmed, "|", 6)
+		if len(parts) != 6 {
+			continue
+		}
+		index, err := strconv.Atoi(strings.TrimSpace(parts[0]))
+		if err != nil {
+			continue
+		}
+		parsed.Recommendations = append(parsed.Recommendations, struct {
+			Index       int    `json:"index"`
+			Verdict     string `json:"verdict"`
+			Reason      string `json:"reason"`
+			Recommended bool   `json:"recommended"`
+			Assessment  string `json:"assessment"`
+			SearchHint  string `json:"searchHint"`
+		}{
+			Index:       index,
+			Verdict:     strings.TrimSpace(parts[1]),
+			Assessment:  strings.TrimSpace(parts[2]),
+			Recommended: strings.EqualFold(strings.TrimSpace(parts[3]), "true") || strings.EqualFold(strings.TrimSpace(parts[3]), "yes"),
+			Reason:      strings.TrimSpace(parts[4]),
+			SearchHint:  strings.TrimSpace(parts[5]),
+		})
+	}
+	return parsed, len(parsed.Recommendations) > 0
+}
+
 type singleCandidateVisionResponse struct {
 	Verdict     string
 	Assessment  string