package services import ( "bytes" "encoding/base64" "encoding/json" "fmt" "io" "mime" "net/http" neturl "net/url" "os" "os/exec" "path/filepath" "sort" "strings" "time" ) type GeminiService struct { APIKey string Client *http.Client GenerateEndpoint string TranslateEndpoint string } type AIRecommendation struct { Title string `json:"title"` Link string `json:"link"` Snippet string `json:"snippet"` ThumbnailURL string `json:"thumbnailUrl"` PreviewVideoURL string `json:"previewVideoUrl"` Source string `json:"source"` Reason string `json:"reason"` Recommended bool `json:"recommended"` } type QueryExpansion struct { Querywords []string `json:"querywords"` } func NewGeminiService(apiKey string) *GeminiService { return &GeminiService{ APIKey: apiKey, Client: &http.Client{Timeout: 40 * time.Second}, GenerateEndpoint: "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent", TranslateEndpoint: "https://translate.googleapis.com/translate_a/single", } } func (g *GeminiService) ExpandQuery(query string) ([]string, error) { englishBase := g.TranslateQuery(query) return buildSearchQueries(query, englishBase), nil } func (g *GeminiService) TranslateQuery(query string) string { trimmed := strings.TrimSpace(query) if trimmed == "" { return "" } normalizedIntent := normalizeKnownMediaPhrases(trimmed) if looksMostlyASCII(normalizedIntent) { return strings.TrimSpace(normalizedIntent) } if looksMostlyASCII(trimmed) { return trimmed } if g.APIKey != "" { body := map[string]any{ "systemInstruction": map[string]any{ "parts": []map[string]string{ { "text": "You translate media search intents into natural English. Output one plain English search phrase only. No labels, no quotes, no explanations.", }, }, }, "contents": []map[string]any{ { "parts": []map[string]string{ { "text": "Translate this user query into concise English suitable for stock-footage search: " + trimmed, }, }, }, }, "generationConfig": map[string]any{ "responseMimeType": "text/plain", "temperature": 0.1, "maxOutputTokens": 40, }, } rawText, err := g.generateText(body) if err == nil { translated := sanitizePlainEnglishLine(rawText) if translated != "" && !strings.EqualFold(translated, trimmed) && !isOvercompressedTranslation(trimmed, translated) { return translated } } } if translated, err := g.translateViaGoogle(trimmed); err == nil && translated != "" && isLikelyEnglishQuery(translated) && !isOvercompressedTranslation(trimmed, translated) { return translated } if translated := translateKoreanMediaTerms(normalizedIntent); translated != "" && !strings.EqualFold(translated, trimmed) { return translated } return strings.TrimSpace(normalizedIntent) } func (g *GeminiService) generateText(body map[string]any) (string, error) { rawBody, _ := json.Marshal(body) endpoint := strings.TrimRight(g.GenerateEndpoint, "?") + "?key=" + g.APIKey resp, err := g.Client.Post(endpoint, "application/json", bytes.NewReader(rawBody)) if err != nil { return "", fmt.Errorf("gemini request failed: %w", err) } defer resp.Body.Close() if resp.StatusCode >= 300 { data, _ := io.ReadAll(io.LimitReader(resp.Body, 2048)) return "", fmt.Errorf("gemini returned status %d: %s", resp.StatusCode, strings.TrimSpace(string(data))) } var payload struct { Candidates []struct { Content struct { Parts []struct { Text string `json:"text"` } `json:"parts"` } `json:"content"` } `json:"candidates"` } if err := json.NewDecoder(resp.Body).Decode(&payload); err != nil { return "", fmt.Errorf("gemini response decode failed: %w", err) } if len(payload.Candidates) == 0 || len(payload.Candidates[0].Content.Parts) == 0 { return "", fmt.Errorf("gemini returned no candidates") } return payload.Candidates[0].Content.Parts[0].Text, nil } func (g *GeminiService) Recommend(query string, candidates []SearchResult) ([]AIRecommendation, error) { if g.APIKey == "" { return nil, fmt.Errorf("gemini api key is not configured") } if len(candidates) == 0 { return []AIRecommendation{}, nil } type geminiPart map[string]any parts := []geminiPart{ { "text": `Analyze the provided images for the user's search intent. Return JSON only in this shape: {"recommendations":[{"index":0,"reason":"short reason","recommended":true}]} Return one entry for every analyzed candidate. Use Korean for every reason. Keep reasons concise but specific enough to explain usefulness. Mark the strongest matches as recommended=true and weaker matches as recommended=false. Prefer cinematic b-roll, stock footage, editorial footage, clean composition, usable establishing shots, and professional media thumbnails. Avoid clickbait faces, exaggerated expressions, meme aesthetics, low-information thumbnails, sensational text overlays, or gossip-style imagery. Favor thumbnails that look directly useful for media editing and footage sourcing. User query: ` + query, }, } maxImages := min(len(candidates), 10) visualCount := 0 for idx := 0; idx < maxImages; idx++ { img, mimeType, err := fetchCandidateVisualInlineData(g.Client, candidates[idx]) if err != nil { continue } visualCount++ parts = append(parts, geminiPart{"text": fmt.Sprintf("Candidate %d: title=%s source=%s link=%s", idx, candidates[idx].Title, candidates[idx].Source, candidates[idx].Link)}, geminiPart{"inlineData": map[string]string{"mimeType": mimeType, "data": img}}, ) } if visualCount == 0 { return nil, fmt.Errorf("no candidate thumbnails or preview frames could be fetched for gemini vision") } body := map[string]any{ "contents": []map[string]any{ {"parts": parts}, }, "generationConfig": map[string]any{ "responseMimeType": "application/json", }, } rawBody, _ := json.Marshal(body) endpoint := strings.TrimRight(g.GenerateEndpoint, "?") + "?key=" + g.APIKey resp, err := g.Client.Post(endpoint, "application/json", bytes.NewReader(rawBody)) if err != nil { return nil, err } defer resp.Body.Close() if resp.StatusCode >= 300 { data, _ := io.ReadAll(io.LimitReader(resp.Body, 2048)) return nil, fmt.Errorf("gemini vision returned status %d: %s", resp.StatusCode, strings.TrimSpace(string(data))) } var payload struct { Candidates []struct { Content struct { Parts []struct { Text string `json:"text"` } `json:"parts"` } `json:"content"` } `json:"candidates"` } if err := json.NewDecoder(resp.Body).Decode(&payload); err != nil { return nil, fmt.Errorf("gemini vision response decode failed: %w", err) } if len(payload.Candidates) == 0 || len(payload.Candidates[0].Content.Parts) == 0 { return nil, fmt.Errorf("gemini vision returned no candidates") } jsonText, err := extractJSONObject(payload.Candidates[0].Content.Parts[0].Text) if err != nil { return nil, fmt.Errorf("gemini vision JSON extraction failed: %w", err) } var parsed struct { Recommendations []struct { Index int `json:"index"` Reason string `json:"reason"` Recommended bool `json:"recommended"` } `json:"recommendations"` } if err := json.Unmarshal([]byte(jsonText), &parsed); err != nil { return nil, fmt.Errorf("gemini vision JSON parse failed: %w; raw=%q", err, truncateForError(payload.Candidates[0].Content.Parts[0].Text, 200)) } recommendations := make([]AIRecommendation, 0, len(parsed.Recommendations)) for _, rec := range parsed.Recommendations { if rec.Index < 0 || rec.Index >= len(candidates) { continue } src := candidates[rec.Index] recommendations = append(recommendations, AIRecommendation{ Title: src.Title, Link: src.Link, Snippet: src.Snippet, ThumbnailURL: src.ThumbnailURL, PreviewVideoURL: src.PreviewVideoURL, Source: src.Source, Reason: normalizeKoreanReason(rec.Reason), Recommended: rec.Recommended, }) } if len(recommendations) == 0 { recommendations = BuildFallbackRecommendations(candidates, 8, "Gemini Vision 평가를 받지 못해 키워드 기준으로 보강된 결과입니다.") } return recommendations, nil } func fetchImageAsInlineData(client *http.Client, imageURL string) (string, string, error) { if strings.TrimSpace(imageURL) == "" { return "", "", fmt.Errorf("image url is empty") } resp, err := client.Get(imageURL) if err == nil { defer resp.Body.Close() } if err != nil || resp.StatusCode >= 300 { req, reqErr := newBrowserStyleImageRequest(imageURL) if reqErr != nil { if err != nil { return "", "", err } return "", "", reqErr } if resp != nil { resp.Body.Close() } resp, err = client.Do(req) if err != nil { return "", "", err } defer resp.Body.Close() } if resp.StatusCode >= 300 { return "", "", fmt.Errorf("thumbnail fetch failed with %d", resp.StatusCode) } contentType := resp.Header.Get("Content-Type") mimeType, _, _ := mime.ParseMediaType(contentType) if mimeType == "" || !strings.HasPrefix(mimeType, "image/") { mimeType = "image/jpeg" } data, err := io.ReadAll(io.LimitReader(resp.Body, 2*1024*1024)) if err != nil { return "", "", err } return base64.StdEncoding.EncodeToString(data), mimeType, nil } func newBrowserStyleImageRequest(imageURL string) (*http.Request, error) { req, err := http.NewRequest(http.MethodGet, imageURL, nil) if err != nil { return nil, err } req.Header.Set("User-Agent", "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36") req.Header.Set("Accept", "image/avif,image/webp,image/apng,image/svg+xml,image/*,*/*;q=0.8") req.Header.Set("Accept-Language", "en-US,en;q=0.9") return req, nil } func fetchCandidateVisualInlineData(client *http.Client, candidate SearchResult) (string, string, error) { if candidate.ThumbnailURL != "" { data, mimeType, err := fetchImageAsInlineData(client, candidate.ThumbnailURL) if err == nil { return data, mimeType, nil } } if candidate.PreviewVideoURL != "" { return extractFrameFromVideo(candidate.PreviewVideoURL) } return "", "", fmt.Errorf("candidate has no thumbnail or preview video") } func extractFrameFromVideo(videoURL string) (string, string, error) { tempDir, err := os.MkdirTemp("", "gemini-frame-*") if err != nil { return "", "", err } defer os.RemoveAll(tempDir) framePath := filepath.Join(tempDir, "frame.jpg") cmd := exec.Command("ffmpeg", "-y", "-ss", "00:00:00.500", "-i", videoURL, "-frames:v", "1", "-q:v", "2", framePath) output, err := cmd.CombinedOutput() if err != nil { return "", "", fmt.Errorf("ffmpeg frame extraction failed: %s", strings.TrimSpace(string(output))) } data, err := os.ReadFile(framePath) if err != nil { return "", "", err } return base64.StdEncoding.EncodeToString(data), "image/jpeg", nil } func min(a, b int) int { if a < b { return a } return b } func extractJSONObject(text string) (string, error) { cleaned := strings.TrimSpace(text) cleaned = strings.TrimPrefix(cleaned, "```json") cleaned = strings.TrimPrefix(cleaned, "```") cleaned = strings.TrimSuffix(cleaned, "```") cleaned = strings.TrimSpace(cleaned) start := strings.Index(cleaned, "{") if start == -1 { return "", fmt.Errorf("no JSON object start found in %q", truncateForError(cleaned, 200)) } depth := 0 inString := false escaped := false for i := start; i < len(cleaned); i++ { ch := cleaned[i] if escaped { escaped = false continue } if ch == '\\' && inString { escaped = true continue } if ch == '"' { inString = !inString continue } if inString { continue } switch ch { case '{': depth++ case '}': depth-- if depth == 0 { return cleaned[start : i+1], nil } } } return "", fmt.Errorf("no complete JSON object found in %q", truncateForError(cleaned, 200)) } func truncateForError(text string, limit int) string { trimmed := strings.TrimSpace(text) if len(trimmed) <= limit { return trimmed } return trimmed[:limit] + "..." } func normalizeKoreanReason(reason string) string { trimmed := strings.TrimSpace(reason) if trimmed == "" { return "시각 정보가 제한적이지만 검색 의도와의 관련성을 기준으로 평가했습니다." } return trimmed } func buildSearchQueries(originalQuery, englishQuery string) []string { base := strings.TrimSpace(englishQuery) if base == "" { base = strings.TrimSpace(originalQuery) } candidates := []string{ base, strings.ReplaceAll(base, "pov", "point of view"), base + " stock footage", base + " b-roll", base + " cinematic footage", base + " editorial footage", base + " establishing shot", } seen := map[string]bool{} queries := make([]string, 0, len(candidates)) for _, item := range candidates { trimmed := strings.TrimSpace(strings.Join(strings.Fields(item), " ")) if trimmed == "" { continue } key := strings.ToLower(trimmed) if seen[key] { continue } seen[key] = true queries = append(queries, trimmed) } return queries } func sanitizePlainEnglishLine(text string) string { lines := strings.Split(text, "\n") for _, line := range lines { line = strings.TrimSpace(strings.Trim(line, "\"'`")) if line == "" { continue } lower := strings.ToLower(line) for _, prefix := range []string{"translation:", "english:", "translated query:"} { if strings.HasPrefix(lower, prefix) { line = strings.TrimSpace(line[len(prefix):]) lower = strings.ToLower(line) } } if strings.HasPrefix(lower, "here is") || strings.HasPrefix(lower, "the translation") { continue } if line != "" { return line } } return "" } func looksMostlyASCII(text string) bool { ascii := 0 runes := []rune(text) for _, r := range runes { if r <= 127 { ascii++ } } return ascii >= len(runes)*8/10 } func isLikelyEnglishQuery(text string) bool { alpha := 0 nonASCII := 0 for _, r := range text { switch { case r >= 'A' && r <= 'Z', r >= 'a' && r <= 'z': alpha++ case r > 127: nonASCII++ } } return alpha > 0 && nonASCII == 0 } func translateKoreanMediaTerms(query string) string { replacements := []struct { korean string english string }{ {korean: "사이버 펑크 도시", english: "cyberpunk city"}, {korean: "사이버펑크 도시", english: "cyberpunk city"}, {korean: "사이버 펑크", english: "cyberpunk"}, {korean: "사이버펑크", english: "cyberpunk"}, {korean: "네온 도시", english: "neon city"}, {korean: "미래 도시", english: "futuristic city"}, {korean: "숲속", english: "forest"}, {korean: "다정한", english: "affectionate"}, {korean: "항공샷", english: "aerial shot"}, {korean: "사람들", english: "people"}, {korean: "행복한", english: "happy"}, {korean: "커플", english: "couple"}, {korean: "연인", english: "lovers"}, {korean: "도시", english: "city"}, {korean: "야경", english: "night city"}, {korean: "거리", english: "street"}, {korean: "골목", english: "alley"}, {korean: "바다", english: "ocean"}, {korean: "해변", english: "beach"}, {korean: "노을", english: "sunset"}, {korean: "자연", english: "nature"}, {korean: "드론", english: "drone"}, {korean: "인파", english: "crowd"}, {korean: "공원", english: "park"}, {korean: "숲", english: "forest"}, {korean: "비", english: "rain"}, {korean: "눈", english: "snow"}, {korean: "산", english: "mountain"}, } sort.SliceStable(replacements, func(i, j int) bool { return len([]rune(replacements[i].korean)) > len([]rune(replacements[j].korean)) }) translated := strings.TrimSpace(query) for _, replacement := range replacements { translated = strings.ReplaceAll(translated, replacement.korean, replacement.english) } translated = strings.Join(strings.Fields(translated), " ") return strings.TrimSpace(translated) } func normalizeKnownMediaPhrases(query string) string { normalized := strings.TrimSpace(query) replacements := []struct { from string to string }{ {from: "사이버 펑크 도시", to: "cyberpunk city"}, {from: "사이버펑크 도시", to: "cyberpunk city"}, {from: "사이버 펑크", to: "cyberpunk"}, {from: "사이버펑크", to: "cyberpunk"}, } for _, replacement := range replacements { normalized = strings.ReplaceAll(normalized, replacement.from, replacement.to) } return strings.Join(strings.Fields(normalized), " ") } func isOvercompressedTranslation(original, translated string) bool { originalWords := len(strings.Fields(strings.TrimSpace(original))) translatedWords := len(strings.Fields(strings.TrimSpace(translated))) if originalWords < 2 || translatedWords >= 2 { return false } lower := strings.ToLower(strings.TrimSpace(translated)) for _, allow := range []string{"cyberpunk", "nightlife", "cityscape"} { if lower == allow { return false } } return true } func (g *GeminiService) translateViaGoogle(query string) (string, error) { baseURL := g.TranslateEndpoint if strings.TrimSpace(baseURL) == "" { baseURL = "https://translate.googleapis.com/translate_a/single" } endpoint := baseURL + "?client=gtx&sl=auto&tl=en&dt=t&q=" + neturl.QueryEscape(query) resp, err := g.Client.Get(endpoint) if err != nil { return "", err } defer resp.Body.Close() if resp.StatusCode >= 300 { return "", fmt.Errorf("google translate fallback returned status %d", resp.StatusCode) } var payload []any if err := json.NewDecoder(resp.Body).Decode(&payload); err != nil { return "", err } if len(payload) == 0 { return "", fmt.Errorf("google translate fallback returned no payload") } top, ok := payload[0].([]any) if !ok { return "", fmt.Errorf("google translate fallback returned unexpected payload") } var builder strings.Builder for _, part := range top { segment, ok := part.([]any) if !ok || len(segment) == 0 { continue } if text, ok := segment[0].(string); ok { builder.WriteString(text) } } translated := strings.TrimSpace(builder.String()) if translated == "" { return "", fmt.Errorf("google translate fallback returned empty translation") } return translated, nil }