package services

import (
	"bytes"
	"encoding/base64"
	"encoding/json"
	"fmt"
	"io"
	"mime"
	"net/http"
	neturl "net/url"
	"strings"
	"time"
)

const (
	// geminiGenerateContentURL is the generateContent endpoint used for every
	// Gemini call in this file. The API key is sent in a header, not the URL.
	geminiGenerateContentURL = "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent"

	// geminiMaxThumbnailBytes caps how much of a thumbnail is downloaded.
	geminiMaxThumbnailBytes = 2 * 1024 * 1024

	// geminiMaxVisionCandidates caps how many thumbnails Recommend sends.
	geminiMaxVisionCandidates = 10

	// geminiErrorBodyLimit caps how much of an error response body is quoted.
	geminiErrorBodyLimit = 2048

	geminiExpandSystemInstruction = "You are a JSON-only API. Output valid JSON only. Never add prose, labels, markdown, or explanations before or after the JSON."

	geminiExpandPrompt = `Return JSON only in this shape: {"querywords":["..."]}. Generate at most 10 concise English search variations for media discovery across Google Video, Envato, and Artgrid. The queries must be usable directly in English search engines for stock footage discovery. Prioritize media, video footage, stock footage, cinematic b-roll, editorial footage, and scene-based search terms. Avoid celebrity gossip, reaction-style phrasing, clickbait phrasing, and generic web search wording. Do not output Korean unless it is part of a proper noun. ` + "\n" + `Original user query: `

	geminiStrictSystemInstruction = "You are a strict JSON emitter. Output one valid JSON object only. Do not write any other text."

	geminiStrictPrompt = `STRICT JSON ONLY. Output must start with { and end with }. Do not add prose, explanations, markdown, code fences, or labels. Return exactly this shape: {"querywords":["..."]}. Generate up to 10 search queries for media discovery across Google Video, Envato, and Artgrid. Every query must be in natural English and suitable for stock-footage search. ` + "\n" + `Original user query: `

	geminiTranslateSystemInstruction = "You translate media search intents into natural English. Output one plain English search phrase only. \nNo labels, no quotes, no explanations."

	geminiRecommendPrompt = `Analyze the provided images for the user's search intent. Return JSON only in this shape: {"recommendations":[{"index":0,"reason":"short reason","recommended":true}]} Mark only the best matches as recommended=true. Keep reasons concise. Recommend up to 8 items. Prefer cinematic b-roll, stock footage, editorial footage, clean composition, usable establishing shots, and professional media thumbnails. Avoid clickbait faces, exaggerated expressions, meme aesthetics, low-information thumbnails, sensational text overlays, or gossip-style imagery. Favor thumbnails that look directly useful for media editing and footage sourcing. User query: `
)

// koreanMediaTermReplacer maps a small fixed vocabulary of Korean media terms
// to English. Longer terms are listed first so that "숲속" is never split by
// the shorter "숲"; strings.Replacer gives earlier pairs priority, which makes
// the result deterministic.
var koreanMediaTermReplacer = strings.NewReplacer(
	"다정한", "affectionate",
	"항공샷", "aerial shot",
	"사람들", "people",
	"행복한", "happy",
	"숲속", "forest",
	"커플", "couple",
	"도시", "city",
	"야경", "night city",
	"거리", "street",
	"골목", "alley",
	"바다", "ocean",
	"해변", "beach",
	"노을", "sunset",
	"자연", "nature",
	"드론", "drone",
	"인파", "crowd",
	"연인", "lovers",
	"공원", "park",
	"숲", "forest",
	"비", "rain",
	"눈", "snow",
	"산", "mountain",
)

// GeminiService is a small client for the Gemini generateContent API.
type GeminiService struct {
	// APIKey authenticates requests; several methods fall back or fail when
	// it is empty.
	APIKey string
	// Client performs all outbound HTTP requests made by the service.
	Client *http.Client
}

// AIRecommendation is one search result annotated with the model's verdict.
type AIRecommendation struct {
	Title        string `json:"title"`
	Link         string `json:"link"`
	ThumbnailURL string `json:"thumbnailUrl"`
	Source       string `json:"source"`
	Reason       string `json:"reason"`
	Recommended  bool   `json:"recommended"`
}

// QueryExpansion is the JSON shape requested from the model by ExpandQuery.
type QueryExpansion struct {
	Querywords []string `json:"querywords"`
}

// NewGeminiService returns a service that uses apiKey and an HTTP client with
// a 40 second timeout.
func NewGeminiService(apiKey string) *GeminiService {
	return &GeminiService{
		APIKey: apiKey,
		Client: &http.Client{Timeout: 40 * time.Second},
	}
}

// ExpandQuery returns search variations for query.
//
// The result always starts with the deterministic variations produced by
// fallbackQueryExpansion; model-generated variations are appended after
// de-duplication. Any model failure degrades to the fallback list, so the
// returned error is always nil.
func (g *GeminiService) ExpandQuery(query string) ([]string, error) {
	if g.APIKey == "" {
		return fallbackQueryExpansion(query, query), nil
	}

	englishBase := g.TranslateQuery(query)
	fallback := fallbackQueryExpansion(query, englishBase)
	promptTail := query + " English base translation: " + englishBase

	rawText, err := g.generateText(queryExpansionRequest(geminiExpandSystemInstruction, geminiExpandPrompt+promptTail, 0.2))
	if err != nil {
		return fallback, nil
	}

	jsonText, err := extractJSONObject(rawText)
	if err != nil {
		// The first answer contained no JSON object: retry once with a
		// stricter prompt and a lower temperature.
		rawText, err = g.generateText(queryExpansionRequest(geminiStrictSystemInstruction, geminiStrictPrompt+promptTail, 0.1))
		if err != nil {
			return fallback, nil
		}
		jsonText, err = extractJSONObject(rawText)
		if err != nil {
			return fallback, nil
		}
	}

	var parsed QueryExpansion
	if err := json.Unmarshal([]byte(jsonText), &parsed); err != nil {
		return fallback, nil
	}

	queries := fallback
	seen := make(map[string]bool, len(queries)+len(parsed.Querywords))
	for _, existing := range queries {
		seen[strings.ToLower(strings.TrimSpace(existing))] = true
	}

	// When translation changed the query, drop model suggestions that are
	// not plain English.
	englishOnly := !strings.EqualFold(strings.TrimSpace(englishBase), strings.TrimSpace(query))
	for _, item := range parsed.Querywords {
		trimmed := strings.TrimSpace(item)
		if trimmed == "" {
			continue
		}
		if englishOnly && !isLikelyEnglishQuery(trimmed) {
			continue
		}
		key := strings.ToLower(trimmed)
		if seen[key] {
			continue
		}
		seen[key] = true
		queries = append(queries, trimmed)
	}
	return queries, nil
}

// queryExpansionRequest builds the generateContent body used by ExpandQuery.
// Both attempts share the schema and token limit and differ only in the
// instruction text and temperature.
func queryExpansionRequest(systemInstruction, prompt string, temperature float64) map[string]any {
	return map[string]any{
		"systemInstruction": map[string]any{
			"parts": []map[string]string{
				{"text": systemInstruction},
			},
		},
		"contents": []map[string]any{
			{
				"parts": []map[string]string{
					{"text": prompt},
				},
			},
		},
		"generationConfig": map[string]any{
			"responseMimeType": "application/json",
			"temperature":      temperature,
			"maxOutputTokens":  220,
			"responseSchema": map[string]any{
				"type": "OBJECT",
				"properties": map[string]any{
					"querywords": map[string]any{
						"type": "ARRAY",
						"items": map[string]any{
							"type": "STRING",
						},
					},
				},
				"required": []string{"querywords"},
			},
		},
	}
}

// TranslateQuery returns an English rendering of query.
//
// Blank or mostly-ASCII queries, and any query when no API key is set, are
// returned trimmed and otherwise unchanged. Otherwise it tries, in order,
// Gemini, the Google Translate fallback, and the built-in Korean term table,
// and returns the trimmed original if none yields a usable result.
func (g *GeminiService) TranslateQuery(query string) string {
	trimmedQuery := strings.TrimSpace(query)
	if trimmedQuery == "" || looksMostlyASCII(query) || g.APIKey == "" {
		return trimmedQuery
	}

	body := map[string]any{
		"systemInstruction": map[string]any{
			"parts": []map[string]string{
				{"text": geminiTranslateSystemInstruction},
			},
		},
		"contents": []map[string]any{
			{
				"parts": []map[string]string{
					{"text": "Translate this user query into concise English suitable for stock-footage search: " + query},
				},
			},
		},
		"generationConfig": map[string]any{
			"responseMimeType": "text/plain",
			"temperature":      0.1,
			"maxOutputTokens":  40,
		},
	}

	if rawText, err := g.generateText(body); err == nil {
		translated := sanitizePlainEnglishLine(rawText)
		if translated != "" && !strings.EqualFold(translated, trimmedQuery) {
			return translated
		}
	}
	if translated, err := g.translateViaGoogle(query); err == nil && translated != "" {
		return translated
	}
	if translated := translateKoreanMediaTerms(query); translated != "" && !strings.EqualFold(translated, trimmedQuery) {
		return translated
	}
	return trimmedQuery
}

// generateText posts body to Gemini and returns the text of the first part of
// the first candidate.
func (g *GeminiService) generateText(body map[string]any) (string, error) {
	return g.callGemini("gemini", body)
}

// callGemini performs one generateContent request and returns the text of the
// first part of the first candidate. label prefixes every error message so
// callers can tell text and vision requests apart.
//
// The API key travels in the x-goog-api-key header instead of the URL: a
// transport error embeds the request URL, so a key in the query string would
// end up in error messages.
func (g *GeminiService) callGemini(label string, body map[string]any) (string, error) {
	rawBody, err := json.Marshal(body)
	if err != nil {
		return "", fmt.Errorf("%s request encode failed: %w", label, err)
	}

	req, err := http.NewRequest(http.MethodPost, geminiGenerateContentURL, bytes.NewReader(rawBody))
	if err != nil {
		return "", fmt.Errorf("%s request build failed: %w", label, err)
	}
	req.Header.Set("Content-Type", "application/json")
	req.Header.Set("x-goog-api-key", g.APIKey)

	resp, err := g.Client.Do(req)
	if err != nil {
		return "", fmt.Errorf("%s request failed: %w", label, err)
	}
	defer resp.Body.Close()

	if resp.StatusCode >= 300 {
		// Best effort: the body only enriches the message, so a read error
		// here is deliberately ignored.
		data, _ := io.ReadAll(io.LimitReader(resp.Body, geminiErrorBodyLimit))
		return "", fmt.Errorf("%s returned status %d: %s", label, resp.StatusCode, strings.TrimSpace(string(data)))
	}

	var payload struct {
		Candidates []struct {
			Content struct {
				Parts []struct {
					Text string `json:"text"`
				} `json:"parts"`
			} `json:"content"`
		} `json:"candidates"`
	}
	if err := json.NewDecoder(resp.Body).Decode(&payload); err != nil {
		return "", fmt.Errorf("%s response decode failed: %w", label, err)
	}
	if len(payload.Candidates) == 0 || len(payload.Candidates[0].Content.Parts) == 0 {
		return "", fmt.Errorf("%s returned no candidates", label)
	}
	return payload.Candidates[0].Content.Parts[0].Text, nil
}

// Recommend asks Gemini to pick the best thumbnails for query among the first
// geminiMaxVisionCandidates candidates.
//
// Candidates whose thumbnail cannot be fetched are left out of the request.
// Only indices that were actually sent are accepted from the answer, and each
// index is accepted once. When nothing is recommended, the first four
// candidates are returned with a fallback reason. An error is returned when
// the API key is missing, the request fails, or the answer cannot be parsed.
func (g *GeminiService) Recommend(query string, candidates []SearchResult) ([]AIRecommendation, error) {
	if g.APIKey == "" {
		return nil, fmt.Errorf("gemini api key is not configured")
	}
	if len(candidates) == 0 {
		return []AIRecommendation{}, nil
	}

	type geminiPart map[string]any
	parts := []geminiPart{
		{"text": geminiRecommendPrompt + query},
	}

	maxImages := min(len(candidates), geminiMaxVisionCandidates)
	sent := make(map[int]bool, maxImages)
	for idx := 0; idx < maxImages; idx++ {
		img, mimeType, err := fetchImageAsInlineData(g.Client, candidates[idx].ThumbnailURL)
		if err != nil {
			// An unreachable thumbnail only removes that candidate.
			continue
		}
		sent[idx] = true
		parts = append(parts,
			geminiPart{"text": fmt.Sprintf("Candidate %d: title=%s source=%s link=%s", idx, candidates[idx].Title, candidates[idx].Source, candidates[idx].Link)},
			geminiPart{"inlineData": map[string]string{"mimeType": mimeType, "data": img}},
		)
	}

	body := map[string]any{
		"contents": []map[string]any{
			{"parts": parts},
		},
		"generationConfig": map[string]any{
			"responseMimeType": "application/json",
		},
	}

	rawText, err := g.callGemini("gemini vision", body)
	if err != nil {
		return nil, err
	}

	jsonText, err := extractJSONObject(rawText)
	if err != nil {
		return nil, fmt.Errorf("gemini vision JSON extraction failed: %w", err)
	}

	var parsed struct {
		Recommendations []struct {
			Index       int    `json:"index"`
			Reason      string `json:"reason"`
			Recommended bool   `json:"recommended"`
		} `json:"recommendations"`
	}
	if err := json.Unmarshal([]byte(jsonText), &parsed); err != nil {
		return nil, fmt.Errorf("gemini vision JSON parse failed: %w; raw=%q", err, truncateForError(rawText, 200))
	}

	recommendations := make([]AIRecommendation, 0, len(parsed.Recommendations))
	picked := make(map[int]bool, len(parsed.Recommendations))
	for _, rec := range parsed.Recommendations {
		// sent is only populated with valid indices, so this also rejects
		// out-of-range values.
		if !rec.Recommended || !sent[rec.Index] || picked[rec.Index] {
			continue
		}
		picked[rec.Index] = true
		src := candidates[rec.Index]
		recommendations = append(recommendations, AIRecommendation{
			Title:        src.Title,
			Link:         src.Link,
			ThumbnailURL: src.ThumbnailURL,
			Source:       src.Source,
			Reason:       rec.Reason,
			Recommended:  true,
		})
	}

	if len(recommendations) == 0 {
		for _, candidate := range candidates[:min(4, len(candidates))] {
			recommendations = append(recommendations, AIRecommendation{
				Title:        candidate.Title,
				Link:         candidate.Link,
				ThumbnailURL: candidate.ThumbnailURL,
				Source:       candidate.Source,
				Reason:       "Fallback result because Gemini returned no recommended items.",
				Recommended:  true,
			})
		}
	}
	return recommendations, nil
}

// fetchImageAsInlineData downloads imageURL and returns its base64-encoded
// bytes together with a MIME type.
//
// The MIME type comes from the Content-Type header and defaults to image/jpeg
// when the header is missing, unparsable, or not an image type. Empty bodies
// and bodies larger than geminiMaxThumbnailBytes are rejected rather than
// truncated, because a truncated image is not a decodable image.
func fetchImageAsInlineData(client *http.Client, imageURL string) (string, string, error) {
	resp, err := client.Get(imageURL)
	if err != nil {
		return "", "", err
	}
	defer resp.Body.Close()

	if resp.StatusCode >= 300 {
		return "", "", fmt.Errorf("thumbnail fetch failed with %d", resp.StatusCode)
	}

	// A parse error leaves mimeType empty, which selects the default below.
	mimeType, _, _ := mime.ParseMediaType(resp.Header.Get("Content-Type"))
	if mimeType == "" || !strings.HasPrefix(mimeType, "image/") {
		mimeType = "image/jpeg"
	}

	// Read one byte past the cap so an oversized body can be detected.
	data, err := io.ReadAll(io.LimitReader(resp.Body, geminiMaxThumbnailBytes+1))
	if err != nil {
		return "", "", err
	}
	if len(data) == 0 {
		return "", "", fmt.Errorf("thumbnail fetch returned an empty body")
	}
	if len(data) > geminiMaxThumbnailBytes {
		return "", "", fmt.Errorf("thumbnail exceeds %d bytes", geminiMaxThumbnailBytes)
	}
	return base64.StdEncoding.EncodeToString(data), mimeType, nil
}

// min returns the smaller of a and b. It is declared at package level, so
// within this package it takes precedence over any built-in of the same name.
func min(a, b int) int {
	if a < b {
		return a
	}
	return b
}

// extractJSONObject returns the first balanced {...} object found in text.
//
// Surrounding whitespace and a leading/trailing Markdown code fence are
// removed first. Braces inside JSON strings, including after escaped quotes,
// do not affect nesting. An error is returned when no object starts or when
// the object never closes.
func extractJSONObject(text string) (string, error) {
	cleaned := strings.TrimSpace(text)
	cleaned = strings.TrimPrefix(cleaned, "```json")
	cleaned = strings.TrimPrefix(cleaned, "```")
	cleaned = strings.TrimSuffix(cleaned, "```")
	cleaned = strings.TrimSpace(cleaned)

	start := strings.Index(cleaned, "{")
	if start == -1 {
		return "", fmt.Errorf("no JSON object start found in %q", truncateForError(cleaned, 200))
	}

	depth := 0
	inString := false
	escaped := false
	for i := start; i < len(cleaned); i++ {
		ch := cleaned[i]
		switch {
		case escaped:
			// The byte after a backslash is consumed verbatim.
			escaped = false
		case ch == '\\' && inString:
			escaped = true
		case ch == '"':
			inString = !inString
		case inString:
			// Structural characters inside a string are plain data.
		case ch == '{':
			depth++
		case ch == '}':
			depth--
			if depth == 0 {
				return cleaned[start : i+1], nil
			}
		}
	}
	return "", fmt.Errorf("no complete JSON object found in %q", truncateForError(cleaned, 200))
}

// truncateForError trims text and shortens it to at most limit bytes followed
// by "..." so it can be embedded in an error message. The cut is moved back
// to the nearest rune boundary so multi-byte characters are never split.
func truncateForError(text string, limit int) string {
	trimmed := strings.TrimSpace(text)
	if len(trimmed) <= limit {
		return trimmed
	}

	// Ranging over a string yields the byte offset of each rune start; keep
	// the last one that does not exceed limit.
	cut := 0
	for i := range trimmed {
		if i > limit {
			break
		}
		cut = i
	}
	return trimmed[:cut] + "..."
}

// fallbackQueryExpansion builds the deterministic list of search variations:
// the base query followed by the base plus a fixed set of footage-related
// suffixes. The base is englishQuery, or originalQuery when englishQuery is
// blank. Entries are trimmed and de-duplicated case-insensitively; if both
// inputs are blank, only the bare suffixes remain.
func fallbackQueryExpansion(originalQuery, englishQuery string) []string {
	base := strings.TrimSpace(englishQuery)
	if base == "" {
		base = strings.TrimSpace(originalQuery)
	}

	suffixes := []string{
		"",
		" b-roll",
		" stock footage",
		" cinematic footage",
		" establishing shot",
		" editorial footage",
		" urban scene",
		" ambient footage",
		" 4k footage",
		" cinematic b-roll",
	}

	seen := make(map[string]bool, len(suffixes))
	queries := make([]string, 0, len(suffixes))
	for _, suffix := range suffixes {
		candidate := strings.TrimSpace(base + suffix)
		if candidate == "" {
			continue
		}
		key := strings.ToLower(candidate)
		if seen[key] {
			continue
		}
		seen[key] = true
		queries = append(queries, candidate)
	}
	return queries
}

// sanitizePlainEnglishLine returns the first usable line of a model answer
// that was supposed to be a single plain phrase.
//
// For each line it removes surrounding whitespace and quote characters,
// strips the labels "translation:", "english:" and "translated query:"
// (case-insensitively), and skips lines that begin with "here is" or
// "the translation". It returns "" when no line survives.
func sanitizePlainEnglishLine(text string) string {
	labels := []string{"translation:", "english:", "translated query:"}

	for _, line := range strings.Split(text, "\n") {
		line = trimQuotesAndSpace(line)
		if line == "" {
			continue
		}

		for _, label := range labels {
			if len(line) >= len(label) && strings.EqualFold(line[:len(label)], label) {
				// The phrase after a label is often quoted as well.
				line = trimQuotesAndSpace(line[len(label):])
			}
		}

		lower := strings.ToLower(line)
		if strings.HasPrefix(lower, "here is") || strings.HasPrefix(lower, "the translation") {
			continue
		}
		if line != "" {
			return line
		}
	}
	return ""
}

// trimQuotesAndSpace removes surrounding whitespace, then surrounding quote
// characters (", ' and `), then any whitespace that was inside the quotes.
func trimQuotesAndSpace(s string) string {
	return strings.TrimSpace(strings.Trim(strings.TrimSpace(s), "\"'`"))
}

// looksMostlyASCII reports whether at least 80% of the runes in text are
// ASCII. An empty string counts as mostly ASCII.
func looksMostlyASCII(text string) bool {
	runes := []rune(text)
	ascii := 0
	for _, r := range runes {
		if r <= 127 {
			ascii++
		}
	}
	return ascii >= len(runes)*8/10
}

// isLikelyEnglishQuery reports whether text contains at least one ASCII
// letter and no non-ASCII runes.
func isLikelyEnglishQuery(text string) bool {
	letters := 0
	for _, r := range text {
		switch {
		case r > 127:
			return false
		case r >= 'A' && r <= 'Z', r >= 'a' && r <= 'z':
			letters++
		}
	}
	return letters > 0
}

// translateKoreanMediaTerms replaces the Korean terms known to
// koreanMediaTermReplacer with their English equivalents and collapses runs
// of whitespace into single spaces. Text outside the table is left as is.
func translateKoreanMediaTerms(query string) string {
	translated := koreanMediaTermReplacer.Replace(strings.TrimSpace(query))
	return strings.Join(strings.Fields(translated), " ")
}

// translateViaGoogle translates query to English through the
// translate.googleapis.com "translate_a/single" endpoint.
//
// The response is decoded as a JSON array whose first element is a list of
// segments; the first string of every segment is concatenated. An error is
// returned for transport failures, non-2xx statuses, unexpected shapes, and
// empty translations.
func (g *GeminiService) translateViaGoogle(query string) (string, error) {
	endpoint := "https://translate.googleapis.com/translate_a/single?client=gtx&sl=auto&tl=en&dt=t&q=" + neturl.QueryEscape(query)
	resp, err := g.Client.Get(endpoint)
	if err != nil {
		return "", err
	}
	defer resp.Body.Close()

	if resp.StatusCode >= 300 {
		return "", fmt.Errorf("google translate fallback returned status %d", resp.StatusCode)
	}

	var payload []any
	if err := json.NewDecoder(resp.Body).Decode(&payload); err != nil {
		return "", err
	}
	if len(payload) == 0 {
		return "", fmt.Errorf("google translate fallback returned no payload")
	}

	segments, ok := payload[0].([]any)
	if !ok {
		return "", fmt.Errorf("google translate fallback returned unexpected payload")
	}

	var builder strings.Builder
	for _, part := range segments {
		segment, ok := part.([]any)
		if !ok || len(segment) == 0 {
			continue
		}
		if text, ok := segment[0].(string); ok {
			builder.WriteString(text)
		}
	}

	translated := strings.TrimSpace(builder.String())
	if translated == "" {
		return "", fmt.Errorf("google translate fallback returned empty translation")
	}
	return translated, nil
}