package services

import (
	"bytes"
	"encoding/base64"
	"encoding/json"
	"fmt"
	"io"
	"mime"
	"net/http"
	"strings"
	"time"
	"unicode/utf8"
)

// geminiGenerateContentURL is the generateContent endpoint used for both query
// expansion and thumbnail ranking.
const geminiGenerateContentURL = "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent"

// geminiAPIKeyHeader is the request header that carries the API key. The key
// is sent as a header rather than a "?key=" query parameter because transport
// errors returned by net/http embed the request URL in their message.
const geminiAPIKeyHeader = "x-goog-api-key"

// maxRecommendImages is the number of leading candidates whose thumbnails are
// downloaded and sent to the model by Recommend.
const maxRecommendImages = 10

// fallbackRecommendationCount is how many leading candidates Recommend returns
// when the model recommends nothing usable.
const fallbackRecommendationCount = 4

// maxThumbnailBytes is the largest thumbnail that is forwarded to the model.
const maxThumbnailBytes = 2 * 1024 * 1024

// errorBodyLimit caps how much of a non-2xx response body is copied into an
// error message.
const errorBodyLimit = 2048

// errorSnippetLimit caps how much model output is quoted in an error message.
const errorSnippetLimit = 200

// geminiExpandPrompt is the first-attempt query expansion prompt; the user
// query is appended to it.
const geminiExpandPrompt = `Return JSON only in this shape: {"querywords":["..."]}. ` +
	`Generate at most 10 concise search variations for media discovery across Google Video, Envato, and Artgrid. ` +
	`If the user query is in Korean, include strong English search variants that a stock footage editor would use. ` +
	`Prioritize media, video footage, stock footage, cinematic b-roll, editorial footage, and scene-based search terms. ` +
	`Avoid celebrity gossip, reaction-style phrasing, clickbait phrasing, and generic web search wording. ` +
	`Mix Korean and English when useful, but make sure several queries are clean English production keywords. ` +
	`User query: `

// geminiStrictExpandPrompt is the stricter prompt used when no JSON object
// could be extracted from the first response; the user query is appended to it.
const geminiStrictExpandPrompt = `STRICT JSON ONLY. Output must start with { and end with }. ` +
	`Do not add prose, explanations, markdown, code fences, or labels. ` +
	`Return exactly this shape: {"querywords":["..."]}. ` +
	`Generate up to 10 search queries for media discovery across Google Video, Envato, and Artgrid. ` +
	`If the original query is Korean, include strong English stock-footage search phrases. ` +
	`User query: `

// geminiRecommendPrompt is the thumbnail ranking prompt; the user query is
// appended to it.
const geminiRecommendPrompt = `Analyze the provided images for the user's search intent. ` +
	`Return JSON only in this shape: {"recommendations":[{"index":0,"reason":"short reason","recommended":true}]} ` +
	`Mark only the best matches as recommended=true. Keep reasons concise. Recommend up to 8 items. ` +
	`Prefer cinematic b-roll, stock footage, editorial footage, clean composition, usable establishing shots, and professional media thumbnails. ` +
	`Avoid clickbait faces, exaggerated expressions, meme aesthetics, low-information thumbnails, sensational text overlays, or gossip-style imagery. ` +
	`Favor thumbnails that look directly useful for media editing and footage sourcing. ` +
	`User query: `

// GeminiService calls the Gemini generateContent REST endpoint.
//
// Client is used both for Gemini requests and for downloading candidate
// thumbnails; it must be non-nil when a method performs a request.
type GeminiService struct {
	APIKey string
	Client *http.Client
}

// AIRecommendation is one search result selected by Recommend, together with
// the reason given for selecting it.
type AIRecommendation struct {
	Title        string `json:"title"`
	Link         string `json:"link"`
	ThumbnailURL string `json:"thumbnailUrl"`
	Source       string `json:"source"`
	Reason       string `json:"reason"`
	Recommended  bool   `json:"recommended"`
}

// QueryExpansion is the JSON shape the model is asked to return by ExpandQuery.
type QueryExpansion struct {
	Querywords []string `json:"querywords"`
}

// NewGeminiService returns a GeminiService that uses apiKey and an HTTP client
// with a 40 second timeout.
func NewGeminiService(apiKey string) *GeminiService {
	return &GeminiService{
		APIKey: apiKey,
		Client: &http.Client{Timeout: 40 * time.Second},
	}
}

// ExpandQuery asks the model for search variations of query.
//
// The returned slice always starts with query itself, followed by the model's
// variations with surrounding whitespace removed and case-insensitive
// duplicates dropped. When no API key is configured it returns just query and
// a nil error. On failure it returns a slice holding only query together with
// the error, so callers can still search with the original text.
func (g *GeminiService) ExpandQuery(query string) ([]string, error) {
	if g.APIKey == "" {
		return []string{query}, nil
	}

	rawText, err := g.generateText(queryExpansionBody(geminiExpandPrompt+query, 0.2))
	if err != nil {
		return []string{query}, err
	}

	jsonText, err := extractJSONObject(rawText)
	if err != nil {
		// One retry with a stricter prompt and lower temperature. rawText is
		// reassigned rather than redeclared so the parse error below quotes
		// the response that was actually parsed.
		rawText, err = g.generateText(queryExpansionBody(geminiStrictExpandPrompt+query, 0.1))
		if err != nil {
			return []string{query}, err
		}
		jsonText, err = extractJSONObject(rawText)
		if err != nil {
			return []string{query}, fmt.Errorf("gemini query expansion JSON extraction failed after strict retry: %w", err)
		}
	}

	var parsed QueryExpansion
	if err := json.Unmarshal([]byte(jsonText), &parsed); err != nil {
		return []string{query}, fmt.Errorf("gemini query expansion JSON parse failed: %w; raw=%q", err, truncateForError(rawText, errorSnippetLimit))
	}

	queries := []string{query}
	seen := map[string]bool{strings.ToLower(strings.TrimSpace(query)): true}
	for _, item := range parsed.Querywords {
		trimmed := strings.TrimSpace(item)
		if trimmed == "" {
			continue
		}
		key := strings.ToLower(trimmed)
		if seen[key] {
			continue
		}
		seen[key] = true
		queries = append(queries, trimmed)
	}
	return queries, nil
}

// queryExpansionBody builds the generateContent request body for a text-only
// query expansion prompt.
func queryExpansionBody(prompt string, temperature float64) map[string]any {
	return map[string]any{
		"contents": []map[string]any{
			{"parts": []map[string]string{{"text": prompt}}},
		},
		"generationConfig": map[string]any{
			"responseMimeType": "application/json",
			"temperature":      temperature,
			"maxOutputTokens":  220,
		},
	}
}

// generateText sends body to the generateContent endpoint and returns the text
// of the first part of the first candidate.
func (g *GeminiService) generateText(body map[string]any) (string, error) {
	return g.postGenerateContent(body, "gemini")
}

// postGenerateContent JSON-encodes body, posts it to the generateContent
// endpoint and returns the text of the first part of the first candidate.
// label prefixes every error message so callers can tell the text and vision
// paths apart.
func (g *GeminiService) postGenerateContent(body any, label string) (string, error) {
	rawBody, err := json.Marshal(body)
	if err != nil {
		return "", fmt.Errorf("%s request encode failed: %w", label, err)
	}

	req, err := http.NewRequest(http.MethodPost, geminiGenerateContentURL, bytes.NewReader(rawBody))
	if err != nil {
		return "", fmt.Errorf("%s request build failed: %w", label, err)
	}
	req.Header.Set("Content-Type", "application/json")
	req.Header.Set(geminiAPIKeyHeader, g.APIKey)

	resp, err := g.Client.Do(req)
	if err != nil {
		return "", fmt.Errorf("%s request failed: %w", label, err)
	}
	defer resp.Body.Close()

	if resp.StatusCode >= 300 {
		// Best effort: the body only enriches the message, so a read error
		// here is not worth replacing the status error with.
		data, _ := io.ReadAll(io.LimitReader(resp.Body, errorBodyLimit))
		return "", fmt.Errorf("%s returned status %d: %s", label, resp.StatusCode, strings.TrimSpace(string(data)))
	}

	var payload struct {
		Candidates []struct {
			Content struct {
				Parts []struct {
					Text string `json:"text"`
				} `json:"parts"`
			} `json:"content"`
		} `json:"candidates"`
	}
	if err := json.NewDecoder(resp.Body).Decode(&payload); err != nil {
		return "", fmt.Errorf("%s response decode failed: %w", label, err)
	}
	if len(payload.Candidates) == 0 || len(payload.Candidates[0].Content.Parts) == 0 {
		return "", fmt.Errorf("%s returned no candidates", label)
	}
	return payload.Candidates[0].Content.Parts[0].Text, nil
}

// Recommend asks the model to pick the best candidates for query based on
// their thumbnails.
//
// Thumbnails of at most the first maxRecommendImages candidates are downloaded
// and sent; candidates whose thumbnail cannot be fetched are left out of the
// request. Only indexes that were actually sent are accepted from the model's
// answer, and each candidate is returned at most once. When nothing usable is
// recommended, the first fallbackRecommendationCount candidates are returned
// instead. An empty candidates slice yields an empty, non-nil result.
func (g *GeminiService) Recommend(query string, candidates []SearchResult) ([]AIRecommendation, error) {
	if g.APIKey == "" {
		return nil, fmt.Errorf("gemini api key is not configured")
	}
	if len(candidates) == 0 {
		return []AIRecommendation{}, nil
	}

	parts := []map[string]any{{"text": geminiRecommendPrompt + query}}

	limit := min(len(candidates), maxRecommendImages)
	// selectable holds the indexes the model was shown and that have not been
	// returned yet.
	selectable := make(map[int]bool, limit)
	for idx := 0; idx < limit; idx++ {
		candidate := candidates[idx]
		img, mimeType, err := fetchImageAsInlineData(g.Client, candidate.ThumbnailURL)
		if err != nil {
			// Best effort: a missing thumbnail only removes this candidate
			// from the ranking request.
			continue
		}
		selectable[idx] = true
		parts = append(parts,
			map[string]any{"text": fmt.Sprintf("Candidate %d: title=%s source=%s link=%s", idx, candidate.Title, candidate.Source, candidate.Link)},
			map[string]any{"inlineData": map[string]string{"mimeType": mimeType, "data": img}},
		)
	}

	body := map[string]any{
		"contents": []map[string]any{
			{"parts": parts},
		},
		"generationConfig": map[string]any{
			"responseMimeType": "application/json",
		},
	}

	rawText, err := g.postGenerateContent(body, "gemini vision")
	if err != nil {
		return nil, err
	}

	jsonText, err := extractJSONObject(rawText)
	if err != nil {
		return nil, fmt.Errorf("gemini vision JSON extraction failed: %w", err)
	}

	var parsed struct {
		Recommendations []struct {
			Index       int    `json:"index"`
			Reason      string `json:"reason"`
			Recommended bool   `json:"recommended"`
		} `json:"recommendations"`
	}
	if err := json.Unmarshal([]byte(jsonText), &parsed); err != nil {
		return nil, fmt.Errorf("gemini vision JSON parse failed: %w; raw=%q", err, truncateForError(rawText, errorSnippetLimit))
	}

	recommendations := make([]AIRecommendation, 0, len(parsed.Recommendations))
	for _, rec := range parsed.Recommendations {
		// The map lookup rejects negative, out-of-range, never-sent and
		// already-used indexes in one step.
		if !rec.Recommended || !selectable[rec.Index] {
			continue
		}
		selectable[rec.Index] = false
		recommendations = append(recommendations, newRecommendation(candidates[rec.Index], rec.Reason))
	}

	if len(recommendations) == 0 {
		for _, candidate := range candidates[:min(fallbackRecommendationCount, len(candidates))] {
			recommendations = append(recommendations, newRecommendation(candidate, "Fallback result because Gemini returned no recommended items."))
		}
	}
	return recommendations, nil
}

// newRecommendation copies the display fields of src into a recommended
// AIRecommendation carrying reason.
func newRecommendation(src SearchResult, reason string) AIRecommendation {
	return AIRecommendation{
		Title:        src.Title,
		Link:         src.Link,
		ThumbnailURL: src.ThumbnailURL,
		Source:       src.Source,
		Reason:       reason,
		Recommended:  true,
	}
}

// fetchImageAsInlineData downloads imageURL with client and returns the
// base64-encoded bytes followed by the image MIME type.
//
// It returns an error for transport failures, status codes of 300 and above,
// empty bodies and bodies larger than maxThumbnailBytes. The MIME type comes
// from the Content-Type header when that names an image type, otherwise from
// sniffing the content, and defaults to "image/jpeg" when neither does.
func fetchImageAsInlineData(client *http.Client, imageURL string) (string, string, error) {
	resp, err := client.Get(imageURL)
	if err != nil {
		return "", "", err
	}
	defer resp.Body.Close()

	if resp.StatusCode >= 300 {
		return "", "", fmt.Errorf("thumbnail fetch failed with %d", resp.StatusCode)
	}

	// Read one byte past the cap so an oversized image is detected instead of
	// being silently truncated into a corrupt upload.
	data, err := io.ReadAll(io.LimitReader(resp.Body, maxThumbnailBytes+1))
	if err != nil {
		return "", "", err
	}
	if len(data) > maxThumbnailBytes {
		return "", "", fmt.Errorf("thumbnail exceeds %d bytes", maxThumbnailBytes)
	}
	if len(data) == 0 {
		return "", "", fmt.Errorf("thumbnail is empty")
	}

	// A malformed Content-Type is not fatal; the fallbacks below handle
	// whatever media type, if any, was parsed.
	mimeType, _, _ := mime.ParseMediaType(resp.Header.Get("Content-Type"))
	if !strings.HasPrefix(mimeType, "image/") {
		mimeType = http.DetectContentType(data)
	}
	if !strings.HasPrefix(mimeType, "image/") {
		mimeType = "image/jpeg"
	}
	return base64.StdEncoding.EncodeToString(data), mimeType, nil
}

// min returns the smaller of a and b.
//
// It is declared at package scope, so within this package it takes precedence
// over the predeclared min of newer Go versions.
func min(a, b int) int {
	if a < b {
		return a
	}
	return b
}

// extractJSONObject returns the first balanced {...} object found in text.
//
// Surrounding whitespace and a leading/trailing Markdown code fence are
// removed first. Braces inside double-quoted strings are ignored, as are
// backslash-escaped characters inside those strings. An error is returned when
// text has no "{" or the object is never closed.
func extractJSONObject(text string) (string, error) {
	cleaned := strings.TrimSpace(text)
	cleaned = strings.TrimPrefix(cleaned, "```json")
	cleaned = strings.TrimPrefix(cleaned, "```")
	cleaned = strings.TrimSuffix(cleaned, "```")
	cleaned = strings.TrimSpace(cleaned)

	start := strings.Index(cleaned, "{")
	if start == -1 {
		return "", fmt.Errorf("no JSON object start found in %q", truncateForError(cleaned, errorSnippetLimit))
	}

	depth := 0
	inString := false
	escaped := false
	for i := start; i < len(cleaned); i++ {
		ch := cleaned[i]
		switch {
		case escaped:
			// The character after a backslash never opens or closes anything.
			escaped = false
		case ch == '\\' && inString:
			escaped = true
		case ch == '"':
			inString = !inString
		case inString:
			// Braces inside a string literal are plain data.
		case ch == '{':
			depth++
		case ch == '}':
			depth--
			if depth == 0 {
				return cleaned[start : i+1], nil
			}
		}
	}
	return "", fmt.Errorf("no complete JSON object found in %q", truncateForError(cleaned, errorSnippetLimit))
}

// truncateForError trims surrounding whitespace from text and shortens it to
// at most limit bytes, appending "..." when anything was cut. The cut is moved
// back to the start of a UTF-8 sequence so a multi-byte character is never
// split. A negative limit is treated as zero.
func truncateForError(text string, limit int) string {
	trimmed := strings.TrimSpace(text)
	if limit < 0 {
		limit = 0
	}
	if len(trimmed) <= limit {
		return trimmed
	}
	cut := limit
	for cut > 0 && !utf8.RuneStart(trimmed[cut]) {
		cut--
	}
	return trimmed[:cut] + "..."
}