Files
ai-media-hub/backend/services/gemini.go
T
GHStaK 513199f426
build-push / docker (push) Successful in 4m13s
Harden gemini vision JSON recovery
2026-03-17 16:33:09 +09:00

1110 lines
34 KiB
Go

package services
import (
"bytes"
"encoding/base64"
"encoding/json"
"fmt"
"io"
"mime"
"net/http"
neturl "net/url"
"os"
"os/exec"
"path/filepath"
"sort"
"strings"
"sync"
"time"
)
// GeminiService bundles the HTTP client, endpoints, debug hook, and in-memory
// caches used for Gemini text/vision requests and the Google Translate
// fallback.
type GeminiService struct {
// APIKey is sent as the "key" query parameter on Gemini requests; Gemini
// calls are skipped or rejected when it is empty.
APIKey string
// Client performs every outbound HTTP request made by the service.
Client *http.Client
// GenerateEndpoint is the Gemini generateContent URL.
GenerateEndpoint string
// TranslateEndpoint is the Google Translate URL; an empty value falls back
// to the built-in default when translating.
TranslateEndpoint string
// Debug, when non-nil, receives a message and structured data for each
// traced step.
Debug func(message string, data any)
// cacheMu guards the three cache maps below.
cacheMu sync.Mutex
// visualCache stores fetched thumbnails and extracted video frames.
visualCache map[string]cachedVisualData
// translationCache stores query and summary translations.
translationCache map[string]cachedStringValue
// expansionCache stores expanded search-query lists.
expansionCache map[string]cachedExpansionValue
}
// cachedVisualData is one visualCache entry: a base64-encoded image, its MIME
// type, and the instant after which the entry is treated as expired.
type cachedVisualData struct {
data string
mimeType string
expiresAt time.Time
}
// cachedStringValue is one translationCache entry: the cached string and the
// instant after which the entry is treated as expired.
type cachedStringValue struct {
value string
expiresAt time.Time
}
// cachedExpansionValue is one expansionCache entry: the cached query list and
// the instant after which the entry is treated as expired.
type cachedExpansionValue struct {
value []string
expiresAt time.Time
}
// AIRecommendation is a search candidate annotated with the Gemini vision
// verdict, serialized to JSON with the field names in the struct tags.
//
// Recommend fills Title, Link, Snippet, ThumbnailURL, PreviewVideoURL and
// Source from the candidate, and Reason, Recommended, Assessment and
// SearchHint from the model output. The remaining optional fields are not set
// anywhere in the visible part of this file — presumably populated by callers;
// verify before relying on them.
type AIRecommendation struct {
Title string `json:"title"`
Link string `json:"link"`
Snippet string `json:"snippet"`
ThumbnailURL string `json:"thumbnailUrl"`
PreviewVideoURL string `json:"previewVideoUrl"`
Source string `json:"source"`
// Reason is the model's explanation (the prompt asks for Korean).
Reason string `json:"reason"`
// Recommended is true when the model flagged the candidate or gave a "Yes" verdict.
Recommended bool `json:"recommended"`
// Assessment is one of: positive, unclear, irrelevant, inappropriate.
Assessment string `json:"assessment,omitempty"`
// SearchHint is a short search phrase suggested by the model.
SearchHint string `json:"searchHint,omitempty"`
MediaMode string `json:"mediaMode,omitempty"`
EmbedURL string `json:"embedUrl,omitempty"`
PreviewBlockedReason string `json:"previewBlockedReason,omitempty"`
ActionLabel string `json:"actionLabel,omitempty"`
ActionType string `json:"actionType,omitempty"`
SecondaryActionLabel string `json:"secondaryActionLabel,omitempty"`
}
// QueryExpansion is the JSON shape {"querywords": [...]} for a list of query
// words. It is not referenced in the visible part of this file.
type QueryExpansion struct {
Querywords []string `json:"querywords"`
}
// NewGeminiService returns a service that uses apiKey, an HTTP client with a
// 40-second timeout, the default Gemini and Google Translate endpoints, and
// empty caches.
func NewGeminiService(apiKey string) *GeminiService {
	svc := new(GeminiService)
	svc.APIKey = apiKey
	svc.Client = &http.Client{Timeout: 40 * time.Second}
	svc.GenerateEndpoint = "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent"
	svc.TranslateEndpoint = "https://translate.googleapis.com/translate_a/single"
	svc.visualCache = make(map[string]cachedVisualData)
	svc.translationCache = make(map[string]cachedStringValue)
	svc.expansionCache = make(map[string]cachedExpansionValue)
	return svc
}
// ExpandQuery translates query to English and derives a list of search
// phrases from it. Results are cached for 15 minutes under the trimmed query.
// The returned error is always nil.
func (g *GeminiService) ExpandQuery(query string) ([]string, error) {
	key := strings.TrimSpace(query)
	if hit, found := g.getCachedExpansion(key); found {
		g.debug("gemini:expand_query_cache_hit", map[string]any{"query": query, "expanded": hit})
		return hit, nil
	}

	english := g.TranslateQuery(query)
	queries := buildSearchQueries(query, english)
	g.setCachedExpansion(key, queries, 15*time.Minute)

	trace := map[string]any{
		"original": query,
		"english":  english,
		"expanded": queries,
	}
	g.debug("gemini:expand_query", trace)
	return queries, nil
}
// TranslateSummaryToKorean translates text to Korean through the Google
// Translate endpoint. Blank input yields "". Text that is not mostly ASCII is
// returned (and cached) unchanged. Successful results are cached for
// 15 minutes; translation failures and empty translations are returned as
// errors and not cached.
func (g *GeminiService) TranslateSummaryToKorean(text string) (string, error) {
	const ttl = 15 * time.Minute

	source := strings.TrimSpace(text)
	if source == "" {
		return "", nil
	}
	// lengthInfo builds the debug payload shared by most trace points.
	lengthInfo := func() map[string]any {
		return map[string]any{"length": len(source)}
	}

	key := "summary-ko\n" + source
	if hit, found := g.getCachedTranslation(key); found {
		g.debug("gemini:summary_translate_cache_hit", lengthInfo())
		return hit, nil
	}
	if !looksMostlyASCII(source) {
		g.setCachedTranslation(key, source, ttl)
		return source, nil
	}

	g.debug("gemini:summary_translate_attempt", lengthInfo())
	result, err := g.translateViaGoogleToTarget(source, "ko")
	if err != nil {
		g.debug("gemini:summary_translate_error", map[string]any{"length": len(source), "error": err.Error()})
		return "", err
	}
	if result = strings.TrimSpace(result); result == "" {
		return "", fmt.Errorf("google translate summary returned empty translation")
	}
	g.debug("gemini:summary_translate_success", lengthInfo())
	g.setCachedTranslation(key, result, ttl)
	return result, nil
}
// TranslateQuery turns a user query into an English footage-search phrase.
//
// Stages, in order; the first usable result is cached for 15 minutes under the
// trimmed query and returned:
//  1. cache lookup
//  2. known-phrase normalization, returned as-is when the result (or the raw
//     query) is already mostly ASCII
//  3. Gemini, only when an API key is configured
//  4. Google Translate
//  5. the built-in Korean media-term dictionary
//  6. the normalized query itself
//
// A blank query yields "".
func (g *GeminiService) TranslateQuery(query string) string {
	input := strings.TrimSpace(query)
	if input == "" {
		return ""
	}
	if hit, found := g.getCachedTranslation(input); found {
		g.debug("gemini:translate_cache_hit", map[string]any{"query": input, "translated": hit})
		return hit
	}

	// remember caches value for this query and hands it back.
	remember := func(value string) string {
		g.setCachedTranslation(input, value, 15*time.Minute)
		return value
	}
	// succeed traces a successful stage, then caches and returns its value.
	succeed := func(mode, value string) string {
		g.debug("gemini:translate_success", map[string]any{"mode": mode, "query": input, "translated": value})
		return remember(value)
	}

	normalized := normalizeKnownMediaPhrases(input)
	switch {
	case looksMostlyASCII(normalized):
		return remember(strings.TrimSpace(normalized))
	case looksMostlyASCII(input):
		return remember(input)
	}

	if g.APIKey != "" {
		g.debug("gemini:translate_attempt", map[string]any{"mode": "gemini", "query": input})
		systemPrompt := "You are a professional video editor. Infer stronger stock-footage and scene-search wording from the user's keyword, and expand it into natural English that a professional editor would use to find usable footage. Output one plain English search phrase only. No labels, no quotes, no explanations."
		userPrompt := "Expand this user query into a concise but editor-grade English footage search phrase suitable for stock-footage discovery: " + input
		request := map[string]any{
			"systemInstruction": map[string]any{
				"parts": []map[string]string{{"text": systemPrompt}},
			},
			"contents": []map[string]any{
				{"parts": []map[string]string{{"text": userPrompt}}},
			},
			"generationConfig": map[string]any{
				"responseMimeType": "text/plain",
				"temperature":      0.1,
				"maxOutputTokens":  40,
			},
		}
		rawText, err := g.generateText(request)
		if err != nil {
			g.debug("gemini:translate_error", map[string]any{"mode": "gemini", "query": input, "error": err.Error()})
		} else if candidate := sanitizePlainEnglishLine(rawText); candidate != "" &&
			!strings.EqualFold(candidate, input) &&
			!isOvercompressedTranslation(input, candidate) {
			return succeed("gemini", candidate)
		}
	}

	g.debug("gemini:translate_attempt", map[string]any{"mode": "google", "query": input})
	if candidate, err := g.translateViaGoogle(input); err == nil &&
		candidate != "" &&
		isLikelyEnglishQuery(candidate) &&
		!isOvercompressedTranslation(input, candidate) {
		return succeed("google", candidate)
	}

	if candidate := translateKoreanMediaTerms(normalized); candidate != "" && !strings.EqualFold(candidate, input) {
		return succeed("dictionary", candidate)
	}

	g.debug("gemini:translate_fallback_original", map[string]any{"query": input, "normalized": normalized})
	return remember(strings.TrimSpace(normalized))
}
// generateText posts body as JSON to the Gemini generate endpoint and returns
// the text of the first part of the first candidate.
//
// Errors are returned when the body cannot be encoded, the request fails, the
// status code is 300 or above (the first 2 KiB of the response is included),
// the response cannot be decoded, or it contains no candidate parts.
func (g *GeminiService) generateText(body map[string]any) (string, error) {
	rawBody, err := json.Marshal(body)
	if err != nil {
		return "", fmt.Errorf("gemini request encode failed: %w", err)
	}

	base := strings.TrimRight(g.GenerateEndpoint, "?")
	// Escape the key so reserved characters cannot corrupt the query string.
	endpoint := base + "?key=" + neturl.QueryEscape(g.APIKey)

	resp, err := g.Client.Post(endpoint, "application/json", bytes.NewReader(rawBody))
	if err != nil {
		// Client errors are *url.Error values whose message embeds the full
		// request URL. Replace it with the key-less endpoint so the API key
		// never reaches wrapped errors or debug logs.
		if urlErr, ok := err.(*neturl.Error); ok {
			urlErr.URL = base
		}
		return "", fmt.Errorf("gemini request failed: %w", err)
	}
	defer resp.Body.Close()

	if resp.StatusCode >= 300 {
		data, _ := io.ReadAll(io.LimitReader(resp.Body, 2048)) // best-effort error detail
		return "", fmt.Errorf("gemini returned status %d: %s", resp.StatusCode, strings.TrimSpace(string(data)))
	}

	var payload struct {
		Candidates []struct {
			Content struct {
				Parts []struct {
					Text string `json:"text"`
				} `json:"parts"`
			} `json:"content"`
		} `json:"candidates"`
	}
	if err := json.NewDecoder(resp.Body).Decode(&payload); err != nil {
		return "", fmt.Errorf("gemini response decode failed: %w", err)
	}
	if len(payload.Candidates) == 0 || len(payload.Candidates[0].Content.Parts) == 0 {
		return "", fmt.Errorf("gemini returned no candidates")
	}
	return payload.Candidates[0].Content.Parts[0].Text, nil
}
// Recommend asks Gemini vision to judge whether each candidate's visual fits
// query and returns one AIRecommendation per model verdict that refers to a
// valid candidate index.
//
// At most the first ten candidates are considered; candidates whose visual
// cannot be fetched are skipped. An error is returned when no API key is
// configured, no visual could be fetched, the request fails or returns a
// status of 300 or above, or the model output cannot be parsed. An empty
// candidate list yields an empty, non-nil slice.
func (g *GeminiService) Recommend(query string, candidates []SearchResult) ([]AIRecommendation, error) {
	if g.APIKey == "" {
		return nil, fmt.Errorf("gemini api key is not configured")
	}
	if len(candidates) == 0 {
		return []AIRecommendation{}, nil
	}
	g.debug("gemini:vision_start", map[string]any{
		"query":          query,
		"candidateCount": len(candidates),
	})

	type geminiPart map[string]any
	parts := []geminiPart{
		{
			"text": `You are a professional video editor. Analyze whether each provided visual is suitable as a usable scene or shot for the user's requested keyword. Return JSON only in this shape:
{"recommendations":[{"index":0,"verdict":"Yes","reason":"short reason","recommended":true,"assessment":"positive","searchHint":"short english hint"}]}
Return one entry for every analyzed candidate. Use Korean for every reason. Keep reasons concise but specific enough to explain usefulness.
Keep each Korean reason very short, ideally one sentence under 24 Korean characters when possible.
Set verdict to "Yes" or "No" for every candidate. "Yes" means the scene is usable and relevant for editing against the user's keyword. "No" means it is not suitable or not relevant enough.
Set recommended=true only when verdict is "Yes". Set recommended=false when verdict is "No".
Set assessment to one of: positive, unclear, irrelevant, inappropriate.
- positive: directly usable and relevant to the query
- unclear: visually ambiguous, weak, or not confident enough
- irrelevant: visibly unrelated to the query intent
- inappropriate: low-quality, spammy, misleading, meme-like, or otherwise unsuitable for professional editing
When assessment is not positive, provide searchHint as a short English stock-footage search phrase that could help find better candidates. Keep it under 8 words.
When assessment is positive, searchHint may be empty.
Do not include markdown fences, explanations, or comments. Output compact JSON only.
Prefer cinematic b-roll, stock footage, editorial footage, clean composition, usable establishing shots, and professional media thumbnails.
Avoid clickbait faces, exaggerated expressions, meme aesthetics, low-information thumbnails, sensational text overlays, or gossip-style imagery.
Favor scenes that look directly useful for professional editing, sequencing, establishing, cutaway, or mood-building usage.
User query: ` + query,
		},
	}

	// Each usable candidate contributes a text label plus its inline image.
	// The label carries the original index so verdicts can be mapped back.
	maxImages := min(len(candidates), 10)
	visualCount := 0
	for idx := 0; idx < maxImages; idx++ {
		img, mimeType, err := g.fetchCandidateVisualInlineData(candidates[idx])
		if err != nil {
			g.debug("gemini:vision_candidate_visual_error", map[string]any{
				"index":  idx,
				"link":   candidates[idx].Link,
				"source": candidates[idx].Source,
				"error":  err.Error(),
			})
			continue
		}
		visualCount++
		parts = append(parts,
			geminiPart{"text": fmt.Sprintf("Candidate %d: title=%s source=%s link=%s", idx, candidates[idx].Title, candidates[idx].Source, candidates[idx].Link)},
			geminiPart{"inlineData": map[string]string{"mimeType": mimeType, "data": img}},
		)
	}
	if visualCount == 0 {
		return nil, fmt.Errorf("no candidate thumbnails or preview frames could be fetched for gemini vision")
	}
	g.debug("gemini:vision_visuals_prepared", map[string]any{
		"query":       query,
		"visualCount": visualCount,
		"maxImages":   maxImages,
	})

	body := map[string]any{
		"contents": []map[string]any{
			{"parts": parts},
		},
		"generationConfig": map[string]any{
			"responseMimeType": "application/json",
			"temperature":      0.1,
			"maxOutputTokens":  900,
		},
	}
	rawBody, err := json.Marshal(body)
	if err != nil {
		return nil, fmt.Errorf("gemini vision request encode failed: %w", err)
	}

	base := strings.TrimRight(g.GenerateEndpoint, "?")
	// Escape the key so reserved characters cannot corrupt the query string.
	endpoint := base + "?key=" + neturl.QueryEscape(g.APIKey)
	resp, err := g.Client.Post(endpoint, "application/json", bytes.NewReader(rawBody))
	if err != nil {
		// Client errors are *url.Error values whose message embeds the full
		// request URL. Replace it with the key-less endpoint so the API key
		// never reaches callers or logs.
		if urlErr, ok := err.(*neturl.Error); ok {
			urlErr.URL = base
		}
		return nil, fmt.Errorf("gemini vision request failed: %w", err)
	}
	defer resp.Body.Close()

	if resp.StatusCode >= 300 {
		data, _ := io.ReadAll(io.LimitReader(resp.Body, 2048)) // best-effort error detail
		return nil, fmt.Errorf("gemini vision returned status %d: %s", resp.StatusCode, strings.TrimSpace(string(data)))
	}

	var payload struct {
		Candidates []struct {
			Content struct {
				Parts []struct {
					Text string `json:"text"`
				} `json:"parts"`
			} `json:"content"`
		} `json:"candidates"`
	}
	if err := json.NewDecoder(resp.Body).Decode(&payload); err != nil {
		return nil, fmt.Errorf("gemini vision response decode failed: %w", err)
	}
	if len(payload.Candidates) == 0 || len(payload.Candidates[0].Content.Parts) == 0 {
		return nil, fmt.Errorf("gemini vision returned no candidates")
	}

	rawText := payload.Candidates[0].Content.Parts[0].Text
	parsed, recoveredPartial, err := parseGeminiVisionRecommendations(rawText)
	if err != nil {
		return nil, fmt.Errorf("gemini vision JSON extraction failed: %w", err)
	}
	if recoveredPartial {
		g.debug("gemini:vision_partial_json_recovered", map[string]any{
			"query":               query,
			"candidateCount":      len(candidates),
			"recommendationCount": len(parsed.Recommendations),
		})
	}

	recommendations := make([]AIRecommendation, 0, len(parsed.Recommendations))
	for _, rec := range parsed.Recommendations {
		// Ignore verdicts that point outside the candidate list.
		if rec.Index < 0 || rec.Index >= len(candidates) {
			continue
		}
		src := candidates[rec.Index]
		// Either signal from the model counts as a recommendation.
		recommended := rec.Recommended || strings.EqualFold(strings.TrimSpace(rec.Verdict), "yes")
		recommendations = append(recommendations, AIRecommendation{
			Title:           src.Title,
			Link:            src.Link,
			Snippet:         src.Snippet,
			ThumbnailURL:    src.ThumbnailURL,
			PreviewVideoURL: src.PreviewVideoURL,
			Source:          src.Source,
			Reason:          normalizeKoreanReason(rec.Reason),
			Recommended:     recommended,
			Assessment:      normalizeAssessment(rec.Assessment, recommended),
			SearchHint:      normalizeSearchHint(rec.SearchHint),
		})
	}
	g.debug("gemini:vision_complete", map[string]any{
		"query":               query,
		"recommendationCount": len(recommendations),
	})
	return recommendations, nil
}
// geminiVisionParsedPayload is the JSON document the vision prompt asks the
// model to return: a list of per-candidate verdicts.
type geminiVisionParsedPayload struct {
Recommendations []struct {
// Index is the candidate number the verdict refers to.
Index int `json:"index"`
// Verdict is expected to be "Yes" or "No".
Verdict string `json:"verdict"`
Reason string `json:"reason"`
Recommended bool `json:"recommended"`
// Assessment is expected to be positive, unclear, irrelevant, or inappropriate.
Assessment string `json:"assessment"`
SearchHint string `json:"searchHint"`
} `json:"recommendations"`
}
// parseGeminiVisionRecommendations decodes the model's raw text into a
// geminiVisionParsedPayload.
//
// It first tries to decode the first complete JSON object in raw. When no
// complete object exists (for example, truncated output) or that object is not
// valid JSON for the payload, it falls back to decoding every complete
// recommendation object individually and keeps the ones that parse.
//
// The boolean result is true when the payload came from that partial
// recovery. When nothing can be recovered, the error from the first attempt is
// returned.
func parseGeminiVisionRecommendations(raw string) (geminiVisionParsedPayload, bool, error) {
	jsonText, err := extractJSONObject(raw)
	if err == nil {
		var parsed geminiVisionParsedPayload
		unmarshalErr := json.Unmarshal([]byte(jsonText), &parsed)
		if unmarshalErr == nil {
			return parsed, false, nil
		}
		// Keep the parse failure as the error to report, but still attempt
		// per-object recovery: one malformed entry should not discard the rest.
		err = fmt.Errorf("json parse failed: %w; raw=%q", unmarshalErr, truncateForError(raw, 200))
	}

	var recovered geminiVisionParsedPayload
	for _, objectText := range extractCompleteRecommendationObjects(raw) {
		// Wrap the lone object in the payload shape so it decodes into the
		// same element type without redeclaring the anonymous struct.
		var single geminiVisionParsedPayload
		wrapped := `{"recommendations":[` + objectText + `]}`
		if json.Unmarshal([]byte(wrapped), &single) != nil {
			continue
		}
		recovered.Recommendations = append(recovered.Recommendations, single.Recommendations...)
	}
	if len(recovered.Recommendations) == 0 {
		return geminiVisionParsedPayload{}, false, err
	}
	return recovered, true, nil
}
// extractCompleteRecommendationObjects returns the text of every complete
// top-level {...} object found inside the array that follows the first
// "recommendations" key in text. Markdown code fences around the text are
// stripped first. Scanning stops at the array's closing bracket or at the end
// of the text, so objects cut off by truncation are simply omitted.
//
// It returns nil when the key or the opening bracket is missing.
func extractCompleteRecommendationObjects(text string) []string {
	body := strings.TrimSpace(text)
	for _, fence := range []string{"```json", "```"} {
		body = strings.TrimPrefix(body, fence)
	}
	body = strings.TrimSpace(strings.TrimSuffix(body, "```"))

	keyAt := strings.Index(body, `"recommendations"`)
	if keyAt < 0 {
		return nil
	}
	bracket := strings.Index(body[keyAt:], "[")
	if bracket < 0 {
		return nil
	}

	found := make([]string, 0, 4)
	var (
		quoted   bool // inside a JSON string literal
		skipNext bool // previous byte was a backslash inside a string
		depth    int  // nesting level of open braces
		openAt   = -1 // start offset of the current top-level object
	)
	for pos := keyAt + bracket + 1; pos < len(body); pos++ {
		switch ch := body[pos]; {
		case skipNext:
			skipNext = false
		case quoted && ch == '\\':
			skipNext = true
		case ch == '"':
			quoted = !quoted
		case quoted:
			// Structural characters inside strings are ignored.
		case ch == '{':
			if depth == 0 {
				openAt = pos
			}
			depth++
		case ch == '}' && depth > 0:
			depth--
			if depth == 0 && openAt >= 0 {
				found = append(found, body[openAt:pos+1])
				openAt = -1
			}
		case ch == ']' && depth == 0:
			return found
		}
	}
	return found
}
// BuildSupplementalQueries asks Gemini for three to five improved English
// search phrases, given the original query, the phrases already tried, and the
// reviewed recommendations.
//
// The prompt includes up to three titles of positively assessed items, up to
// four search hints from weak items, and per-source counts for Envato, Artgrid
// and Google Video. An error is returned when no API key is configured, the
// Gemini call fails, or no usable phrase is returned.
func (g *GeminiService) BuildSupplementalQueries(query string, existing []string, reviewed []AIRecommendation) ([]string, error) {
	phrases := make([]string, 0, len(existing))
	for _, raw := range existing {
		if phrase := strings.TrimSpace(raw); phrase != "" {
			phrases = append(phrases, phrase)
		}
	}
	if len(phrases) == 0 {
		phrases = append(phrases, query)
	}

	const (
		maxGoodTitles = 3
		maxWeakHints  = 4
	)
	goodTitles := make([]string, 0, maxGoodTitles)
	weakHints := make([]string, 0, maxWeakHints)
	perSource := map[string]int{}
	for _, rec := range reviewed {
		perSource[rec.Source]++
		switch rec.Assessment {
		case "positive":
			if len(goodTitles) < maxGoodTitles {
				goodTitles = append(goodTitles, truncateForError(strings.TrimSpace(rec.Title), 80))
			}
		case "irrelevant", "inappropriate", "unclear":
			if rec.SearchHint != "" && len(weakHints) < maxWeakHints {
				weakHints = append(weakHints, rec.SearchHint)
			}
		}
	}

	if g.APIKey == "" {
		return nil, fmt.Errorf("gemini api key is not configured")
	}

	userPrompt := fmt.Sprintf("Original query: %s\nExisting search phrases: %s\nPositive candidate titles: %s\nNegative or weak search hints: %s\nSource distribution: Envato=%d, Artgrid=%d, Google Video=%d\nGenerate improved English search phrases that avoid weak or irrelevant results and increase provider diversity.",
		query,
		strings.Join(phrases, " | "),
		strings.Join(goodTitles, " | "),
		strings.Join(weakHints, " | "),
		perSource["Envato"],
		perSource["Artgrid"],
		perSource["Google Video"],
	)
	request := map[string]any{
		"systemInstruction": map[string]any{
			"parts": []map[string]string{
				{"text": "You generate improved stock-footage search phrases. Return 3 to 5 plain English search phrases only, one per line, no numbering, no quotes, no explanations."},
			},
		},
		"contents": []map[string]any{
			{"parts": []map[string]string{{"text": userPrompt}}},
		},
		"generationConfig": map[string]any{
			"responseMimeType": "text/plain",
			"temperature":      0.3,
			"maxOutputTokens":  120,
		},
	}

	rawText, err := g.generateText(request)
	if err != nil {
		return nil, err
	}
	supplemental := parseSupplementalQueryLines(rawText)
	if len(supplemental) == 0 {
		return nil, fmt.Errorf("gemini returned no supplemental queries")
	}
	return supplemental, nil
}
// debug forwards message and data to the Debug hook. It is a no-op when the
// receiver or the hook is nil.
func (g *GeminiService) debug(message string, data any) {
	if g == nil || g.Debug == nil {
		return
	}
	g.Debug(message, data)
}
// fetchImageAsInlineData downloads imageURL with browser-like headers (and the
// given referer) and returns the body base64-encoded together with its MIME
// type.
//
// The MIME type comes from the Content-Type header when it names an image;
// otherwise the bytes are sniffed, and "image/jpeg" is used as a last resort.
// An error is returned for a blank URL, a request or transport failure, a
// status of 300 or above, an empty body, or a body larger than 2 MiB. Oversized
// bodies are rejected rather than truncated, since a truncated file is not a
// valid image.
func fetchImageAsInlineData(client *http.Client, imageURL, referer string) (string, string, error) {
	const maxImageBytes = 2 * 1024 * 1024

	if strings.TrimSpace(imageURL) == "" {
		return "", "", fmt.Errorf("image url is empty")
	}
	if client == nil {
		client = http.DefaultClient
	}
	req, err := newBrowserStyleImageRequest(imageURL, referer)
	if err != nil {
		return "", "", err
	}
	resp, err := client.Do(req)
	if err != nil {
		return "", "", err
	}
	defer resp.Body.Close()
	if resp.StatusCode >= 300 {
		return "", "", fmt.Errorf("thumbnail fetch failed with %d", resp.StatusCode)
	}

	// Read one byte past the cap so an oversized body can be told apart from
	// one that is exactly at the limit.
	data, err := io.ReadAll(io.LimitReader(resp.Body, maxImageBytes+1))
	if err != nil {
		return "", "", err
	}
	if len(data) > maxImageBytes {
		return "", "", fmt.Errorf("thumbnail exceeds %d bytes", maxImageBytes)
	}
	if len(data) == 0 {
		return "", "", fmt.Errorf("thumbnail response body is empty")
	}

	mimeType, _, _ := mime.ParseMediaType(resp.Header.Get("Content-Type"))
	if !strings.HasPrefix(mimeType, "image/") {
		// Header missing or unhelpful: trust the bytes before guessing.
		mimeType, _, _ = mime.ParseMediaType(http.DetectContentType(data))
		if !strings.HasPrefix(mimeType, "image/") {
			mimeType = "image/jpeg"
		}
	}
	return base64.StdEncoding.EncodeToString(data), mimeType, nil
}
// getCachedVisual returns the cached base64 data and MIME type for key. The
// boolean is false when the key is absent or its entry has expired; an expired
// entry is deleted on the way out.
func (g *GeminiService) getCachedVisual(key string) (string, string, bool) {
	g.cacheMu.Lock()
	defer g.cacheMu.Unlock()

	entry, found := g.visualCache[key]
	switch {
	case !found:
		return "", "", false
	case time.Now().After(entry.expiresAt):
		delete(g.visualCache, key)
		return "", "", false
	default:
		return entry.data, entry.mimeType, true
	}
}
// setCachedVisual stores data and mimeType under key for ttl.
//
// Expired entries are swept on every write: reads only evict the key being
// looked up, so without the sweep visuals that are never requested again would
// stay in memory indefinitely. The map is created lazily so a GeminiService
// built without NewGeminiService does not panic.
func (g *GeminiService) setCachedVisual(key, data, mimeType string, ttl time.Duration) {
	g.cacheMu.Lock()
	defer g.cacheMu.Unlock()

	now := time.Now()
	if g.visualCache == nil {
		g.visualCache = make(map[string]cachedVisualData)
	}
	for existing, entry := range g.visualCache {
		if now.After(entry.expiresAt) {
			delete(g.visualCache, existing)
		}
	}
	g.visualCache[key] = cachedVisualData{
		data:      data,
		mimeType:  mimeType,
		expiresAt: now.Add(ttl),
	}
}
// getCachedTranslation returns the cached string for key. The boolean is false
// when the key is absent or its entry has expired; an expired entry is deleted
// on the way out.
func (g *GeminiService) getCachedTranslation(key string) (string, bool) {
	g.cacheMu.Lock()
	defer g.cacheMu.Unlock()

	entry, found := g.translationCache[key]
	if found && !time.Now().After(entry.expiresAt) {
		return entry.value, true
	}
	if found {
		delete(g.translationCache, key)
	}
	return "", false
}
// setCachedTranslation stores value under key for ttl.
//
// Expired entries are swept on every write: reads only evict the key being
// looked up, so without the sweep one-off queries would accumulate
// indefinitely. The map is created lazily so a GeminiService built without
// NewGeminiService does not panic.
func (g *GeminiService) setCachedTranslation(key, value string, ttl time.Duration) {
	g.cacheMu.Lock()
	defer g.cacheMu.Unlock()

	now := time.Now()
	if g.translationCache == nil {
		g.translationCache = make(map[string]cachedStringValue)
	}
	for existing, entry := range g.translationCache {
		if now.After(entry.expiresAt) {
			delete(g.translationCache, existing)
		}
	}
	g.translationCache[key] = cachedStringValue{
		value:     value,
		expiresAt: now.Add(ttl),
	}
}
// getCachedExpansion returns a copy of the cached query list for key, so
// callers cannot mutate the cached slice. The boolean is false when the key is
// absent or its entry has expired; an expired entry is deleted on the way out.
func (g *GeminiService) getCachedExpansion(key string) ([]string, bool) {
	g.cacheMu.Lock()
	defer g.cacheMu.Unlock()

	entry, found := g.expansionCache[key]
	if !found {
		return nil, false
	}
	if entry.expiresAt.Before(time.Now()) {
		delete(g.expansionCache, key)
		return nil, false
	}
	snapshot := append(make([]string, 0, len(entry.value)), entry.value...)
	return snapshot, true
}
// setCachedExpansion stores a copy of value under key for ttl, so later
// changes to the caller's slice do not affect the cache.
//
// Expired entries are swept on every write: reads only evict the key being
// looked up, so without the sweep one-off queries would accumulate
// indefinitely. The map is created lazily so a GeminiService built without
// NewGeminiService does not panic.
func (g *GeminiService) setCachedExpansion(key string, value []string, ttl time.Duration) {
	g.cacheMu.Lock()
	defer g.cacheMu.Unlock()

	now := time.Now()
	if g.expansionCache == nil {
		g.expansionCache = make(map[string]cachedExpansionValue)
	}
	for existing, entry := range g.expansionCache {
		if now.After(entry.expiresAt) {
			delete(g.expansionCache, existing)
		}
	}
	cloned := make([]string, len(value))
	copy(cloned, value)
	g.expansionCache[key] = cachedExpansionValue{
		value:     cloned,
		expiresAt: now.Add(ttl),
	}
}
// newBrowserStyleImageRequest builds a GET request for imageURL carrying
// desktop-Chrome-like User-Agent, Accept and Accept-Language headers. A
// Referer header is added only when referer is non-blank.
func newBrowserStyleImageRequest(imageURL, referer string) (*http.Request, error) {
	req, err := http.NewRequest(http.MethodGet, imageURL, nil)
	if err != nil {
		return nil, err
	}

	headers := [][2]string{
		{"User-Agent", "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36"},
		{"Accept", "image/avif,image/webp,image/apng,image/svg+xml,image/*,*/*;q=0.8"},
		{"Accept-Language", "en-US,en;q=0.9"},
	}
	for _, header := range headers {
		req.Header.Set(header[0], header[1])
	}
	if strings.TrimSpace(referer) != "" {
		req.Header.Set("Referer", referer)
	}
	return req, nil
}
// fetchCandidateVisualInlineData returns one base64-encoded visual (and its
// MIME type) for candidate, trying sources in this order:
//
//  1. a frame from the preview video, for Envato and Artgrid candidates
//  2. the candidate's thumbnail, unless it is judged low value
//  3. a thumbnail derived from the candidate link, when different from (2)
//  4. a frame from the preview video, for all other sources
//
// Every successful result is cached for 10 minutes. Frame extraction is
// attempted at most once per call: when step 1 already failed for this video,
// step 4 is skipped instead of re-running ffmpeg on the same URL. When all
// sources fail, the error of the last attempted source is returned.
func (g *GeminiService) fetchCandidateVisualInlineData(candidate SearchResult) (string, string, error) {
	const visualTTL = 10 * time.Minute

	// cached serves cacheKey from the visual cache, or runs fetch and caches
	// its result on success.
	cached := func(cacheKey string, fetch func() (string, string, error)) (string, string, error) {
		if data, mimeType, ok := g.getCachedVisual(cacheKey); ok {
			return data, mimeType, nil
		}
		data, mimeType, err := fetch()
		if err != nil {
			return "", "", err
		}
		g.setCachedVisual(cacheKey, data, mimeType, visualTTL)
		return data, mimeType, nil
	}
	frame := func() (string, string, error) {
		return cached("frame\n"+candidate.PreviewVideoURL, func() (string, string, error) {
			return extractFrameFromVideo(candidate.PreviewVideoURL)
		})
	}
	image := func(imageURL string) (string, string, error) {
		return cached("image\n"+imageURL, func() (string, string, error) {
			return fetchImageAsInlineData(g.Client, imageURL, candidate.Link)
		})
	}

	lastErr := fmt.Errorf("candidate has no thumbnail or preview video")
	hasPreview := candidate.PreviewVideoURL != ""
	frameAttempted := false

	if hasPreview && (candidate.Source == "Envato" || candidate.Source == "Artgrid") {
		frameAttempted = true
		data, mimeType, err := frame()
		if err == nil {
			return data, mimeType, nil
		}
		lastErr = err
	}

	if candidate.ThumbnailURL != "" {
		if isLowValueThumbnail(candidate.ThumbnailURL) {
			g.debug("gemini:vision_candidate_rejected_low_value", map[string]any{
				"link":         candidate.Link,
				"source":       candidate.Source,
				"thumbnailUrl": candidate.ThumbnailURL,
			})
			lastErr = fmt.Errorf("candidate thumbnail is low value")
		} else {
			data, mimeType, err := image(candidate.ThumbnailURL)
			if err == nil {
				return data, mimeType, nil
			}
			lastErr = err
		}
	}

	if fallbackThumbnail := deriveThumbnail(candidate.Link); fallbackThumbnail != "" && fallbackThumbnail != candidate.ThumbnailURL {
		data, mimeType, err := image(fallbackThumbnail)
		if err == nil {
			return data, mimeType, nil
		}
		lastErr = err
	}

	if hasPreview && !frameAttempted {
		data, mimeType, err := frame()
		if err == nil {
			return data, mimeType, nil
		}
		lastErr = err
	}
	return "", "", lastErr
}
// extractFrameFromVideo runs ffmpeg to grab a single JPEG frame 0.5 seconds
// into videoURL and returns it base64-encoded with the MIME type "image/jpeg".
//
// ffmpeg is killed if it has not finished within 30 seconds, so a stalled
// remote stream cannot block the caller indefinitely. Failures wrap the
// underlying exec error and include ffmpeg's combined output.
//
// NOTE(review): videoURL is handed to ffmpeg as-is, and ffmpeg accepts many
// input protocols besides http(s) (for example local files). Confirm that
// callers only pass trusted URLs, or restrict the scheme upstream.
func extractFrameFromVideo(videoURL string) (string, string, error) {
	const frameTimeout = 30 * time.Second

	tempDir, err := os.MkdirTemp("", "gemini-frame-*")
	if err != nil {
		return "", "", err
	}
	defer os.RemoveAll(tempDir)

	framePath := filepath.Join(tempDir, "frame.jpg")
	cmd := exec.Command("ffmpeg", "-y", "-ss", "00:00:00.500", "-i", videoURL, "-frames:v", "1", "-q:v", "2", framePath)
	var output bytes.Buffer
	cmd.Stdout = &output
	cmd.Stderr = &output

	if err := cmd.Start(); err != nil {
		return "", "", fmt.Errorf("ffmpeg frame extraction failed: %w", err)
	}
	// The timer only exists after Start succeeded, so cmd.Process is non-nil.
	// Killing an already finished process just returns an ignorable error.
	killer := time.AfterFunc(frameTimeout, func() { _ = cmd.Process.Kill() })
	err = cmd.Wait()
	killer.Stop()
	if err != nil {
		return "", "", fmt.Errorf("ffmpeg frame extraction failed: %w: %s", err, strings.TrimSpace(output.String()))
	}

	data, err := os.ReadFile(framePath)
	if err != nil {
		return "", "", err
	}
	return base64.StdEncoding.EncodeToString(data), "image/jpeg", nil
}
// min returns the smaller of a and b.
func min(a, b int) int {
	if b < a {
		return b
	}
	return a
}
// extractJSONObject returns the first brace-balanced {...} span in text,
// after stripping surrounding Markdown code fences. Braces inside JSON string
// literals are ignored. An error is returned when there is no opening brace or
// the object never closes (for example, truncated output).
func extractJSONObject(text string) (string, error) {
	body := strings.TrimSpace(text)
	for _, fence := range []string{"```json", "```"} {
		body = strings.TrimPrefix(body, fence)
	}
	body = strings.TrimSpace(strings.TrimSuffix(body, "```"))

	open := strings.Index(body, "{")
	if open < 0 {
		return "", fmt.Errorf("no JSON object start found in %q", truncateForError(body, 200))
	}

	var (
		quoted   bool // inside a JSON string literal
		skipNext bool // previous byte was a backslash inside a string
		depth    int  // nesting level of open braces
	)
	for pos := open; pos < len(body); pos++ {
		switch ch := body[pos]; {
		case skipNext:
			skipNext = false
		case quoted && ch == '\\':
			skipNext = true
		case ch == '"':
			quoted = !quoted
		case quoted:
			// Structural characters inside strings are ignored.
		case ch == '{':
			depth++
		case ch == '}':
			depth--
			if depth == 0 {
				return body[open : pos+1], nil
			}
		}
	}
	return "", fmt.Errorf("no complete JSON object found in %q", truncateForError(body, 200))
}
// truncateForError trims text and, when it is longer than limit bytes, cuts it
// to at most limit bytes and appends "...". The cut is moved back to the
// nearest rune boundary so multi-byte characters (for example Korean) are
// never split into invalid UTF-8. A negative limit is treated as zero.
func truncateForError(text string, limit int) string {
	trimmed := strings.TrimSpace(text)
	if limit < 0 {
		limit = 0
	}
	if len(trimmed) <= limit {
		return trimmed
	}
	cut := limit
	// UTF-8 continuation bytes look like 10xxxxxx; step back past them so the
	// cut lands on the first byte of a rune.
	for cut > 0 && trimmed[cut]&0xC0 == 0x80 {
		cut--
	}
	return trimmed[:cut] + "..."
}
// normalizeKoreanReason returns reason with surrounding whitespace removed, or
// a fixed Korean fallback sentence when reason is blank.
func normalizeKoreanReason(reason string) string {
	if cleaned := strings.TrimSpace(reason); cleaned != "" {
		return cleaned
	}
	return "시각 정보가 제한적이지만 검색 의도와의 관련성을 기준으로 평가했습니다."
}
// normalizeAssessment maps the model's assessment onto one of the four known
// labels (positive, unclear, irrelevant, inappropriate), ignoring case and
// surrounding whitespace. Unknown or empty values become "positive" when the
// candidate is recommended and "unclear" otherwise.
func normalizeAssessment(assessment string, recommended bool) string {
	label := strings.ToLower(strings.TrimSpace(assessment))
	for _, known := range [...]string{"positive", "unclear", "irrelevant", "inappropriate"} {
		if label == known {
			return label
		}
	}
	if !recommended {
		return "unclear"
	}
	return "positive"
}
// normalizeSearchHint cleans a model-provided search hint: surrounding
// whitespace and quote characters are removed, inner whitespace runs collapse
// to single spaces, and the result is capped at 80 bytes.
//
// The cap is applied on a rune boundary so a non-ASCII hint is never cut into
// invalid UTF-8, and a trailing space left by the cut is dropped.
func normalizeSearchHint(text string) string {
	const maxHintBytes = 80

	// Trim whitespace first so quotes are removed even when padded by spaces.
	unquoted := strings.Trim(strings.TrimSpace(text), "\"'`")
	hint := strings.Join(strings.Fields(unquoted), " ")
	if len(hint) <= maxHintBytes {
		return hint
	}
	cut := maxHintBytes
	// UTF-8 continuation bytes look like 10xxxxxx; step back past them so the
	// cut lands on the first byte of a rune.
	for cut > 0 && hint[cut]&0xC0 == 0x80 {
		cut--
	}
	return strings.TrimSpace(hint[:cut])
}
// parseSupplementalQueryLines splits model output into at most five distinct
// search phrases, one per line. List markers, surrounding quotes and trailing
// periods are removed; blank lines and case-insensitive duplicates are
// skipped.
func parseSupplementalQueryLines(text string) []string {
	const maxQueries = 5

	seen := make(map[string]bool, maxQueries)
	queries := make([]string, 0, maxQueries)
	for _, line := range strings.Split(text, "\n") {
		phrase := stripSupplementalListMarker(line)
		if phrase == "" {
			continue
		}
		key := strings.ToLower(phrase)
		if seen[key] {
			continue
		}
		seen[key] = true
		queries = append(queries, phrase)
		if len(queries) >= maxQueries {
			break
		}
	}
	return queries
}

// stripSupplementalListMarker removes a leading bullet ("-", "*", "•") or
// numbering ("1.", "2)", "3:") that is followed by whitespace, then strips
// surrounding quotes and trailing periods. Digits that belong to the phrase
// itself ("4k drone footage", "city 2024", "3.5mm lens") are preserved.
func stripSupplementalListMarker(line string) string {
	phrase := strings.TrimSpace(line)
	startsWithSpace := func(s string) bool {
		return s == "" || s[0] == ' ' || s[0] == '\t'
	}

	if rest := strings.TrimLeft(phrase, "-*•"); rest != phrase && startsWithSpace(rest) {
		phrase = strings.TrimSpace(rest)
	}

	digits := 0
	for digits < len(phrase) && phrase[digits] >= '0' && phrase[digits] <= '9' {
		digits++
	}
	if digits > 0 && digits < len(phrase) && strings.IndexByte(".):", phrase[digits]) >= 0 {
		if rest := phrase[digits+1:]; startsWithSpace(rest) {
			phrase = strings.TrimSpace(rest)
		}
	}

	phrase = strings.TrimLeft(phrase, "\"'` \t")
	return strings.TrimRight(phrase, "\"'`. \t")
}
// buildSearchQueries derives the list of search phrases for a query. The base
// phrase is englishQuery, or originalQuery when the former is blank. The list
// holds the base, a variant with the standalone word "pov" (any case) spelled
// out as "point of view", and the base suffixed with common stock-footage
// qualifiers. Whitespace is collapsed; empty and case-insensitively duplicate
// entries are dropped while keeping the original order.
//
// Only whole words are expanded, so words that merely contain "pov" (such as
// "poverty") are left intact.
func buildSearchQueries(originalQuery, englishQuery string) []string {
	base := strings.TrimSpace(englishQuery)
	if base == "" {
		base = strings.TrimSpace(originalQuery)
	}

	words := strings.Fields(base)
	for i, word := range words {
		if strings.EqualFold(word, "pov") {
			words[i] = "point of view"
		}
	}

	candidates := []string{
		base,
		strings.Join(words, " "),
		base + " stock footage",
		base + " b-roll",
		base + " cinematic footage",
		base + " editorial footage",
		base + " establishing shot",
	}
	seen := make(map[string]bool, len(candidates))
	queries := make([]string, 0, len(candidates))
	for _, item := range candidates {
		phrase := strings.Join(strings.Fields(item), " ")
		if phrase == "" {
			continue
		}
		key := strings.ToLower(phrase)
		if seen[key] {
			continue
		}
		seen[key] = true
		queries = append(queries, phrase)
	}
	return queries
}
// sanitizePlainEnglishLine picks the first usable line from model output.
// For each line it strips surrounding whitespace and quote characters, removes
// a leading "translation:", "english:" or "translated query:" label (any
// case), strips quotes again from what remains, and skips lines that are empty
// or start with "here is" / "the translation". It returns "" when no line
// qualifies.
func sanitizePlainEnglishLine(text string) string {
	const quoteChars = "\"'`"
	// clean trims whitespace before and after removing quotes, so padded
	// quotes and quotes that follow a label are both handled.
	clean := func(s string) string {
		return strings.TrimSpace(strings.Trim(strings.TrimSpace(s), quoteChars))
	}

	for _, line := range strings.Split(text, "\n") {
		line = clean(line)
		if line == "" {
			continue
		}
		for _, prefix := range []string{"translation:", "english:", "translated query:"} {
			if len(line) >= len(prefix) && strings.EqualFold(line[:len(prefix)], prefix) {
				line = clean(line[len(prefix):])
			}
		}
		lower := strings.ToLower(line)
		if strings.HasPrefix(lower, "here is") || strings.HasPrefix(lower, "the translation") {
			continue
		}
		if line != "" {
			return line
		}
	}
	return ""
}
// looksMostlyASCII reports whether at least 80% of the runes in text are
// ASCII. Empty text counts as ASCII.
//
// The comparison is done as ascii*10 >= total*8 to avoid integer-division
// rounding: dividing first rounds the threshold down to zero for a one-rune
// input, which would classify a single non-ASCII character (for example "비")
// as ASCII.
func looksMostlyASCII(text string) bool {
	total, ascii := 0, 0
	for _, r := range text {
		total++
		if r <= 127 {
			ascii++
		}
	}
	return ascii*10 >= total*8
}
// isLikelyEnglishQuery reports whether text contains at least one ASCII letter
// and no non-ASCII runes at all.
func isLikelyEnglishQuery(text string) bool {
	hasLetter := false
	for _, r := range text {
		if r > 127 {
			return false
		}
		if ('A' <= r && r <= 'Z') || ('a' <= r && r <= 'z') {
			hasLetter = true
		}
	}
	return hasLetter
}
// translateKoreanMediaTerms replaces known Korean media terms in query with
// their English equivalents and collapses whitespace. Longer terms are
// replaced first so that multi-word phrases win over the shorter terms they
// contain. Text that matches no term is left as-is.
//
// Each replacement is padded with spaces before whitespace is collapsed, so
// terms written without a separating space still come out as separate English
// words ("행복한커플" becomes "happy couple" rather than "happycouple").
func translateKoreanMediaTerms(query string) string {
	replacements := []struct {
		korean  string
		english string
	}{
		{korean: "사이버 펑크 도시", english: "cyberpunk city"},
		{korean: "사이버펑크 도시", english: "cyberpunk city"},
		{korean: "사이버 펑크", english: "cyberpunk"},
		{korean: "사이버펑크", english: "cyberpunk"},
		{korean: "네온 도시", english: "neon city"},
		{korean: "미래 도시", english: "futuristic city"},
		{korean: "숲속", english: "forest"},
		{korean: "다정한", english: "affectionate"},
		{korean: "항공샷", english: "aerial shot"},
		{korean: "사람들", english: "people"},
		{korean: "행복한", english: "happy"},
		{korean: "커플", english: "couple"},
		{korean: "연인", english: "lovers"},
		{korean: "도시", english: "city"},
		{korean: "야경", english: "night city"},
		{korean: "거리", english: "street"},
		{korean: "골목", english: "alley"},
		{korean: "바다", english: "ocean"},
		{korean: "해변", english: "beach"},
		{korean: "노을", english: "sunset"},
		{korean: "자연", english: "nature"},
		{korean: "드론", english: "drone"},
		{korean: "인파", english: "crowd"},
		{korean: "공원", english: "park"},
		{korean: "숲", english: "forest"},
		{korean: "비", english: "rain"},
		{korean: "눈", english: "snow"},
		{korean: "산", english: "mountain"},
	}
	// Longest terms first (by rune count); the stable sort keeps table order
	// among terms of equal length.
	sort.SliceStable(replacements, func(i, j int) bool {
		return len([]rune(replacements[i].korean)) > len([]rune(replacements[j].korean))
	})

	translated := strings.TrimSpace(query)
	for _, replacement := range replacements {
		translated = strings.ReplaceAll(translated, replacement.korean, " "+replacement.english+" ")
	}
	return strings.Join(strings.Fields(translated), " ")
}
func normalizeKnownMediaPhrases(query string) string {
normalized := strings.TrimSpace(query)
replacements := []struct {
from string
to string
}{
{from: "사이버 펑크 도시", to: "cyberpunk city"},
{from: "사이버펑크 도시", to: "cyberpunk city"},
{from: "사이버 펑크", to: "cyberpunk"},
{from: "사이버펑크", to: "cyberpunk"},
}
for _, replacement := range replacements {
normalized = strings.ReplaceAll(normalized, replacement.from, replacement.to)
}
return strings.Join(strings.Fields(normalized), " ")
}
// isOvercompressedTranslation reports whether a multi-word original was
// reduced to fewer than two words by translation, which suggests lost meaning.
// The single words "cyberpunk", "nightlife" and "cityscape" (any case) are
// accepted as legitimate one-word translations.
func isOvercompressedTranslation(original, translated string) bool {
	if len(strings.Fields(original)) < 2 {
		return false
	}
	if len(strings.Fields(translated)) >= 2 {
		return false
	}
	switch strings.ToLower(strings.TrimSpace(translated)) {
	case "cyberpunk", "nightlife", "cityscape":
		return false
	}
	return true
}
// translateViaGoogle translates query into English ("en") by delegating to
// translateViaGoogleToTarget.
func (g *GeminiService) translateViaGoogle(query string) (string, error) {
return g.translateViaGoogleToTarget(query, "en")
}
// translateViaGoogleToTarget translates query into targetLanguage through the
// configured Google Translate endpoint, letting the service auto-detect the
// source language. A blank endpoint falls back to the public default and a
// blank target language to "en".
//
// The response is expected to be a JSON array whose first element is a list of
// segments, each segment's first element being a translated text fragment; the
// fragments are concatenated and trimmed. An error is returned for transport
// failures, a status of 300 or above, an undecodable or unexpectedly shaped
// payload, or an empty translation.
func (g *GeminiService) translateViaGoogleToTarget(query, targetLanguage string) (string, error) {
	base := g.TranslateEndpoint
	if strings.TrimSpace(base) == "" {
		base = "https://translate.googleapis.com/translate_a/single"
	}
	lang := strings.TrimSpace(targetLanguage)
	if lang == "" {
		lang = "en"
	}

	requestURL := base + "?client=gtx&sl=auto&tl=" + neturl.QueryEscape(lang) + "&dt=t&q=" + neturl.QueryEscape(query)
	resp, err := g.Client.Get(requestURL)
	if err != nil {
		return "", err
	}
	defer resp.Body.Close()
	if resp.StatusCode >= 300 {
		return "", fmt.Errorf("google translate fallback returned status %d", resp.StatusCode)
	}

	var payload []any
	if err := json.NewDecoder(resp.Body).Decode(&payload); err != nil {
		return "", err
	}
	if len(payload) == 0 {
		return "", fmt.Errorf("google translate fallback returned no payload")
	}
	segments, isList := payload[0].([]any)
	if !isList {
		return "", fmt.Errorf("google translate fallback returned unexpected payload")
	}

	pieces := make([]string, 0, len(segments))
	for _, raw := range segments {
		fields, isSegment := raw.([]any)
		if !isSegment || len(fields) == 0 {
			continue
		}
		if fragment, isText := fields[0].(string); isText {
			pieces = append(pieces, fragment)
		}
	}
	translated := strings.TrimSpace(strings.Join(pieces, ""))
	if translated == "" {
		return "", fmt.Errorf("google translate fallback returned empty translation")
	}
	return translated, nil
}