Files
ai-media-hub/backend/services/gemini.go
T
AI Assistant 129507357e
build-push / docker (push) Successful in 4m10s
Harden Gemini JSON parsing
2026-03-13 11:02:50 +09:00

326 lines
9.5 KiB
Go

package services
import (
"bytes"
"encoding/base64"
"encoding/json"
"fmt"
"io"
"mime"
"net/http"
"strings"
"time"
)
// GeminiService talks to the Gemini generateContent REST endpoint.
//
// APIKey authenticates requests; when it is empty ExpandQuery returns the
// query unchanged and Recommend returns an error. Client performs the Gemini
// calls and is also used to download candidate thumbnails.
type GeminiService struct {
	APIKey string
	Client *http.Client
}
// AIRecommendation is one search candidate selected by Recommend, serialised
// with camel-case JSON keys.
type AIRecommendation struct {
	// Title, Link, ThumbnailURL and Source are copied from the search candidate.
	Title        string `json:"title"`
	Link         string `json:"link"`
	ThumbnailURL string `json:"thumbnailUrl"`
	Source       string `json:"source"`
	// Reason is Gemini's short justification, or a fixed fallback message.
	Reason string `json:"reason"`
	// Recommended is set to true on every value Recommend returns.
	Recommended bool `json:"recommended"`
}
// QueryExpansion mirrors the JSON object ExpandQuery asks Gemini to produce:
// {"querywords":["..."]}.
type QueryExpansion struct {
	// Querywords holds the suggested search variations as returned by the model.
	Querywords []string `json:"querywords"`
}
// NewGeminiService builds a GeminiService for apiKey with its own HTTP client
// whose requests time out after 40 seconds.
func NewGeminiService(apiKey string) *GeminiService {
	httpClient := &http.Client{Timeout: 40 * time.Second}

	svc := new(GeminiService)
	svc.APIKey = apiKey
	svc.Client = httpClient
	return svc
}
// ExpandQuery asks Gemini for search variations of query suited to stock
// footage discovery.
//
// The returned slice always starts with the original query, followed by the
// model's suggestions with surrounding whitespace trimmed, blanks dropped and
// case-insensitive duplicates removed, in the order the model produced them.
// The prompt requests at most 10 variations; the count is not enforced here.
//
// When no API key is configured the original query is returned alone and no
// request is made. On any failure the slice still contains the original query
// so callers can continue searching, and the error says what went wrong.
func (g *GeminiService) ExpandQuery(query string) ([]string, error) {
	if g.APIKey == "" {
		return []string{query}, nil
	}

	body := map[string]any{
		"contents": []map[string]any{
			{
				"parts": []map[string]string{
					{
						"text": `Return JSON only in this shape: {"querywords":["..."]}.
Generate at most 10 concise search variations for media discovery across Google Video, Envato, and Artgrid.
If the user query is in Korean, include strong English search variants that a stock footage editor would use.
Prioritize media, video footage, stock footage, cinematic b-roll, editorial footage, and scene-based search terms.
Avoid celebrity gossip, reaction-style phrasing, clickbait phrasing, and generic web search wording.
Mix Korean and English when useful, but make sure several queries are clean English production keywords.
User query: ` + query,
					},
				},
			},
		},
		"generationConfig": map[string]any{
			"responseMimeType": "application/json",
			"temperature":      0.2,
			"maxOutputTokens":  220,
		},
	}
	rawBody, err := json.Marshal(body)
	if err != nil {
		return []string{query}, fmt.Errorf("gemini query expansion request encode failed: %w", err)
	}

	// The key travels in a header rather than the URL: transport errors embed
	// the request URL, so a "?key=" query parameter would leak the secret into
	// every wrapped error and log line.
	const endpoint = "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent"
	req, err := http.NewRequest(http.MethodPost, endpoint, bytes.NewReader(rawBody))
	if err != nil {
		return []string{query}, fmt.Errorf("gemini query expansion request build failed: %w", err)
	}
	req.Header.Set("Content-Type", "application/json")
	req.Header.Set("x-goog-api-key", g.APIKey)

	resp, err := g.Client.Do(req)
	if err != nil {
		return []string{query}, fmt.Errorf("gemini query expansion request failed: %w", err)
	}
	defer resp.Body.Close()
	if resp.StatusCode >= 300 {
		// Best-effort read of a bounded prefix of the error body for diagnostics.
		data, _ := io.ReadAll(io.LimitReader(resp.Body, 2048))
		return []string{query}, fmt.Errorf("gemini query expansion returned status %d: %s", resp.StatusCode, strings.TrimSpace(string(data)))
	}

	var payload struct {
		Candidates []struct {
			Content struct {
				Parts []struct {
					Text string `json:"text"`
				} `json:"parts"`
			} `json:"content"`
		} `json:"candidates"`
	}
	if err := json.NewDecoder(resp.Body).Decode(&payload); err != nil {
		return []string{query}, fmt.Errorf("gemini query expansion response decode failed: %w", err)
	}
	if len(payload.Candidates) == 0 || len(payload.Candidates[0].Content.Parts) == 0 {
		return []string{query}, fmt.Errorf("gemini query expansion returned no candidates")
	}

	rawText := payload.Candidates[0].Content.Parts[0].Text
	jsonText, err := extractJSONObject(rawText)
	if err != nil {
		return []string{query}, fmt.Errorf("gemini query expansion JSON extraction failed: %w", err)
	}
	var parsed QueryExpansion
	if err := json.Unmarshal([]byte(jsonText), &parsed); err != nil {
		return []string{query}, fmt.Errorf("gemini query expansion JSON parse failed: %w; raw=%q", err, truncateForError(rawText, 200))
	}

	// Seed the de-duplication set with the original query so the model echoing
	// it back does not produce a duplicate entry.
	queries := make([]string, 0, len(parsed.Querywords)+1)
	queries = append(queries, query)
	seen := map[string]bool{strings.ToLower(strings.TrimSpace(query)): true}
	for _, item := range parsed.Querywords {
		trimmed := strings.TrimSpace(item)
		if trimmed == "" {
			continue
		}
		key := strings.ToLower(trimmed)
		if seen[key] {
			continue
		}
		seen[key] = true
		queries = append(queries, trimmed)
	}
	return queries, nil
}
// Recommend asks Gemini to pick the candidates whose thumbnails best match
// query.
//
// Thumbnails of at most the first 10 candidates are downloaded and sent inline
// together with a text label carrying each candidate's index; candidates whose
// thumbnail cannot be fetched are left out of the request. Only indexes that
// were actually shown to the model and that it marked as recommended are
// returned, each at most once, in the model's order. If that leaves nothing,
// the first (up to) 4 candidates are returned with a fallback reason.
//
// It returns an error when no API key is configured, when the request fails,
// when Gemini answers with a non-2xx status, or when the response cannot be
// decoded. An empty candidates slice yields an empty, non-nil result.
func (g *GeminiService) Recommend(query string, candidates []SearchResult) ([]AIRecommendation, error) {
	if g.APIKey == "" {
		return nil, fmt.Errorf("gemini api key is not configured")
	}
	if len(candidates) == 0 {
		return []AIRecommendation{}, nil
	}

	type geminiPart map[string]any
	parts := []geminiPart{
		{
			"text": `Analyze the provided images for the user's search intent. Return JSON only in this shape:
{"recommendations":[{"index":0,"reason":"short reason","recommended":true}]}
Mark only the best matches as recommended=true. Keep reasons concise. Recommend up to 8 items.
Prefer cinematic b-roll, stock footage, editorial footage, clean composition, usable establishing shots, and professional media thumbnails.
Avoid clickbait faces, exaggerated expressions, meme aesthetics, low-information thumbnails, sensational text overlays, or gossip-style imagery.
Favor thumbnails that look directly useful for media editing and footage sourcing.
User query: ` + query,
		},
	}

	maxImages := min(len(candidates), 10)
	// presented records which candidate indexes the model actually gets to
	// see, so indexes it invents for unseen candidates can be rejected later.
	presented := make(map[int]bool, maxImages)
	for idx := 0; idx < maxImages; idx++ {
		img, mimeType, err := fetchImageAsInlineData(g.Client, candidates[idx].ThumbnailURL)
		if err != nil {
			// Best effort: one broken thumbnail must not sink the whole request.
			continue
		}
		parts = append(parts,
			geminiPart{"text": fmt.Sprintf("Candidate %d: title=%s source=%s link=%s", idx, candidates[idx].Title, candidates[idx].Source, candidates[idx].Link)},
			geminiPart{"inlineData": map[string]string{"mimeType": mimeType, "data": img}},
		)
		presented[idx] = true
	}

	body := map[string]any{
		"contents": []map[string]any{
			{"parts": parts},
		},
		"generationConfig": map[string]any{
			"responseMimeType": "application/json",
		},
	}
	rawBody, err := json.Marshal(body)
	if err != nil {
		return nil, fmt.Errorf("gemini vision request encode failed: %w", err)
	}

	// The key travels in a header rather than the URL: transport errors embed
	// the request URL, so a "?key=" query parameter would leak the secret into
	// the returned error.
	const endpoint = "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent"
	req, err := http.NewRequest(http.MethodPost, endpoint, bytes.NewReader(rawBody))
	if err != nil {
		return nil, fmt.Errorf("gemini vision request build failed: %w", err)
	}
	req.Header.Set("Content-Type", "application/json")
	req.Header.Set("x-goog-api-key", g.APIKey)

	resp, err := g.Client.Do(req)
	if err != nil {
		return nil, fmt.Errorf("gemini vision request failed: %w", err)
	}
	defer resp.Body.Close()
	if resp.StatusCode >= 300 {
		// Best-effort read of a bounded prefix of the error body for diagnostics.
		data, _ := io.ReadAll(io.LimitReader(resp.Body, 2048))
		return nil, fmt.Errorf("gemini vision returned status %d: %s", resp.StatusCode, strings.TrimSpace(string(data)))
	}

	var payload struct {
		Candidates []struct {
			Content struct {
				Parts []struct {
					Text string `json:"text"`
				} `json:"parts"`
			} `json:"content"`
		} `json:"candidates"`
	}
	if err := json.NewDecoder(resp.Body).Decode(&payload); err != nil {
		return nil, fmt.Errorf("gemini vision response decode failed: %w", err)
	}
	if len(payload.Candidates) == 0 || len(payload.Candidates[0].Content.Parts) == 0 {
		return nil, fmt.Errorf("gemini vision returned no candidates")
	}

	rawText := payload.Candidates[0].Content.Parts[0].Text
	jsonText, err := extractJSONObject(rawText)
	if err != nil {
		return nil, fmt.Errorf("gemini vision JSON extraction failed: %w", err)
	}
	var parsed struct {
		Recommendations []struct {
			Index       int    `json:"index"`
			Reason      string `json:"reason"`
			Recommended bool   `json:"recommended"`
		} `json:"recommendations"`
	}
	if err := json.Unmarshal([]byte(jsonText), &parsed); err != nil {
		return nil, fmt.Errorf("gemini vision JSON parse failed: %w; raw=%q", err, truncateForError(rawText, 200))
	}

	recommendations := make([]AIRecommendation, 0, len(parsed.Recommendations))
	picked := make(map[int]bool, len(parsed.Recommendations))
	for _, rec := range parsed.Recommendations {
		// presented only contains valid candidate indexes, so this also
		// rejects negative and out-of-range values.
		if !rec.Recommended || !presented[rec.Index] || picked[rec.Index] {
			continue
		}
		picked[rec.Index] = true
		src := candidates[rec.Index]
		recommendations = append(recommendations, AIRecommendation{
			Title:        src.Title,
			Link:         src.Link,
			ThumbnailURL: src.ThumbnailURL,
			Source:       src.Source,
			Reason:       rec.Reason,
			Recommended:  true,
		})
	}

	if len(recommendations) == 0 {
		// Never hand back an empty list when there were candidates: surface
		// the leading ones so the caller still has something to show.
		for _, candidate := range candidates[:min(4, len(candidates))] {
			recommendations = append(recommendations, AIRecommendation{
				Title:        candidate.Title,
				Link:         candidate.Link,
				ThumbnailURL: candidate.ThumbnailURL,
				Source:       candidate.Source,
				Reason:       "Fallback result because Gemini returned no recommended items.",
				Recommended:  true,
			})
		}
	}
	return recommendations, nil
}
// fetchImageAsInlineData downloads imageURL with client and returns the body
// as standard base64 together with its image MIME type, ready to be used as a
// Gemini inlineData part.
//
// The MIME type comes from the Content-Type header when it names an image;
// otherwise the payload itself is sniffed. An error is returned when the
// request fails, the status is 300 or above, the body is empty or larger than
// 2 MiB, or the payload cannot be identified as an image. Failing in those
// cases lets the caller skip the thumbnail instead of sending truncated or
// mislabelled data.
func fetchImageAsInlineData(client *http.Client, imageURL string) (string, string, error) {
	const maxImageBytes = 2 * 1024 * 1024

	resp, err := client.Get(imageURL)
	if err != nil {
		return "", "", err
	}
	defer resp.Body.Close()
	if resp.StatusCode >= 300 {
		return "", "", fmt.Errorf("thumbnail fetch failed with %d", resp.StatusCode)
	}

	// Read one byte past the cap so an oversized body is detected rather than
	// silently cut off mid-image.
	data, err := io.ReadAll(io.LimitReader(resp.Body, maxImageBytes+1))
	if err != nil {
		return "", "", err
	}
	if len(data) == 0 {
		return "", "", fmt.Errorf("thumbnail response body is empty")
	}
	if len(data) > maxImageBytes {
		return "", "", fmt.Errorf("thumbnail exceeds %d bytes", maxImageBytes)
	}

	// A missing or malformed header yields an empty media type, which simply
	// falls through to content sniffing below, so the parse error is ignored.
	mimeType, _, _ := mime.ParseMediaType(resp.Header.Get("Content-Type"))
	if !strings.HasPrefix(mimeType, "image/") {
		mimeType = http.DetectContentType(data)
		if !strings.HasPrefix(mimeType, "image/") {
			return "", "", fmt.Errorf("thumbnail is not an image (detected %s)", mimeType)
		}
	}
	return base64.StdEncoding.EncodeToString(data), mimeType, nil
}
// min reports the smaller of two ints. It is declared at package level, so it
// takes precedence over the built-in of the same name on Go 1.21 and later.
func min(a, b int) int {
	if b <= a {
		return b
	}
	return a
}
// extractJSONObject pulls a JSON object out of model output that may be
// wrapped in a Markdown code fence or surrounded by prose.
//
// It returns the first brace-balanced {...} span that is valid JSON. If no
// balanced span is valid JSON, the first balanced span is returned anyway so
// the caller's decoder can report the precise syntax error. An error is
// returned when the text contains no '{' or when the first object is never
// closed (for example, truncated output); in the latter case nested objects
// are deliberately not returned, since they would hide the truncation.
func extractJSONObject(text string) (string, error) {
	cleaned := strings.TrimSpace(text)
	cleaned = strings.TrimPrefix(cleaned, "```json")
	cleaned = strings.TrimPrefix(cleaned, "```")
	cleaned = strings.TrimSuffix(cleaned, "```")
	cleaned = strings.TrimSpace(cleaned)

	start := strings.IndexByte(cleaned, '{')
	if start == -1 {
		return "", fmt.Errorf("no JSON object start found in %q", truncateForError(cleaned, 200))
	}
	end := jsonObjectEnd(cleaned, start)
	if end == -1 {
		return "", fmt.Errorf("no complete JSON object found in %q", truncateForError(cleaned, 200))
	}
	first := cleaned[start : end+1]
	if json.Valid([]byte(first)) {
		return first, nil
	}

	// The first balanced span is not JSON, e.g. a "{placeholder}" in prose
	// ahead of the real payload. Try the balanced spans that follow it.
	for from := end + 1; from < len(cleaned); {
		rel := strings.IndexByte(cleaned[from:], '{')
		if rel == -1 {
			break
		}
		s := from + rel
		e := jsonObjectEnd(cleaned, s)
		if e == -1 {
			break
		}
		if candidate := cleaned[s : e+1]; json.Valid([]byte(candidate)) {
			return candidate, nil
		}
		from = e + 1
	}
	return first, nil
}

// jsonObjectEnd returns the index of the '}' that closes the '{' at s[start],
// ignoring braces inside double-quoted strings, or -1 if it is never closed.
func jsonObjectEnd(s string, start int) int {
	depth := 0
	inString, escaped := false, false
	for i := start; i < len(s); i++ {
		ch := s[i]
		if inString {
			switch {
			case escaped:
				// The character after a backslash is consumed verbatim.
				escaped = false
			case ch == '\\':
				escaped = true
			case ch == '"':
				inString = false
			}
			continue
		}
		switch ch {
		case '"':
			inString = true
		case '{':
			depth++
		case '}':
			depth--
			if depth == 0 {
				return i
			}
		}
	}
	return -1
}
// truncateForError trims surrounding whitespace from text and shortens it to
// at most limit bytes for inclusion in an error message, appending "..." when
// anything was cut.
//
// The cut is moved back to the nearest rune boundary so multi-byte UTF-8
// text (for example Korean queries) is never split in the middle of a
// character. A negative limit is treated as zero.
func truncateForError(text string, limit int) string {
	trimmed := strings.TrimSpace(text)
	if limit < 0 {
		limit = 0
	}
	if len(trimmed) <= limit {
		return trimmed
	}

	// Ranging over a string yields the byte offset of each rune start, so the
	// last offset not beyond limit is the closest safe place to cut.
	cut := 0
	for i := range trimmed {
		if i > limit {
			break
		}
		cut = i
	}
	return trimmed[:cut] + "..."
}