Files
ai-media-hub/backend/services/gemini.go
T
AI Assistant 4a51998cbd
build-push / docker (push) Successful in 4m12s
Improve translation fallback and scale UI
2026-03-13 12:11:38 +09:00

562 lines
15 KiB
Go

package services
import (
"bytes"
"encoding/base64"
"encoding/json"
"fmt"
"io"
"mime"
"net/http"
neturl "net/url"
"strings"
"time"
)
// GeminiService bundles the credentials and HTTP client used by this file's
// query-translation, query-expansion, and recommendation helpers.
type GeminiService struct {
// APIKey is the Gemini API key. When it is empty, ExpandQuery and
// TranslateQuery skip the remote call and Recommend returns an error.
APIKey string
// Client performs every outbound HTTP request made by the service methods
// (Gemini calls, thumbnail downloads, and the translate fallback).
Client *http.Client
}
// AIRecommendation is one search candidate returned by Recommend, serialized
// with the JSON field names given in the struct tags.
type AIRecommendation struct {
// Title, Link, ThumbnailURL, and Source are copied from the originating
// SearchResult.
Title string `json:"title"`
Link string `json:"link"`
ThumbnailURL string `json:"thumbnailUrl"`
Source string `json:"source"`
// Reason is the short justification reported by the model, or a fixed
// fallback message when the model recommended nothing.
Reason string `json:"reason"`
// Recommended is set to true on every value Recommend builds.
Recommended bool `json:"recommended"`
}
// QueryExpansion mirrors the {"querywords":[...]} JSON object that ExpandQuery
// asks the model to produce.
type QueryExpansion struct {
// Querywords holds the search-query variations decoded from the model reply.
Querywords []string `json:"querywords"`
}
// NewGeminiService returns a GeminiService that authenticates with apiKey and
// issues requests through a dedicated HTTP client with a 40-second timeout.
func NewGeminiService(apiKey string) *GeminiService {
	svc := new(GeminiService)
	svc.APIKey = apiKey
	svc.Client = &http.Client{Timeout: 40 * time.Second}
	return svc
}
// ExpandQuery turns a user query into a list of search-query variations.
//
// The result always starts with the deterministic variations produced by
// fallbackQueryExpansion; model-generated variations are appended after them,
// skipping blanks and case-insensitive duplicates. Any failure along the way
// (missing API key, request error, unparsable reply) yields the fallback list
// alone, so the returned error is always nil.
func (g *GeminiService) ExpandQuery(query string) ([]string, error) {
	if g.APIKey == "" {
		return fallbackQueryExpansion(query, query), nil
	}

	englishBase := g.TranslateQuery(query)
	fallback := fallbackQueryExpansion(query, englishBase)

	// newRequest assembles a generateContent payload; the two attempts differ
	// only in their instructions and sampling temperature.
	newRequest := func(systemText, userText string, temperature float64) map[string]any {
		return map[string]any{
			"systemInstruction": map[string]any{
				"parts": []map[string]string{{"text": systemText}},
			},
			"contents": []map[string]any{
				{"parts": []map[string]string{{"text": userText}}},
			},
			"generationConfig": map[string]any{
				"responseMimeType": "application/json",
				"temperature":      temperature,
				"maxOutputTokens":  220,
				"responseSchema": map[string]any{
					"type": "OBJECT",
					"properties": map[string]any{
						"querywords": map[string]any{
							"type":  "ARRAY",
							"items": map[string]any{"type": "STRING"},
						},
					},
					"required": []string{"querywords"},
				},
			},
		}
	}

	firstPrompt := `Return JSON only in this shape: {"querywords":["..."]}.
Generate at most 10 concise English search variations for media discovery across Google Video, Envato, and Artgrid.
The queries must be usable directly in English search engines for stock footage discovery.
Prioritize media, video footage, stock footage, cinematic b-roll, editorial footage, and scene-based search terms.
Avoid celebrity gossip, reaction-style phrasing, clickbait phrasing, and generic web search wording.
Do not output Korean unless it is part of a proper noun.
Original user query: ` + query + `
English base translation: ` + englishBase

	firstReply, err := g.generateText(newRequest(
		"You are a JSON-only API. Output valid JSON only. Never add prose, labels, markdown, or explanations before or after the JSON.",
		firstPrompt,
		0.2,
	))
	if err != nil {
		return fallback, nil
	}

	jsonText, err := extractJSONObject(firstReply)
	if err != nil {
		// The first reply held no JSON object: retry once with stricter wording.
		strictPrompt := `STRICT JSON ONLY.
Output must start with { and end with }.
Do not add prose, explanations, markdown, code fences, or labels.
Return exactly this shape: {"querywords":["..."]}.
Generate up to 10 search queries for media discovery across Google Video, Envato, and Artgrid.
Every query must be in natural English and suitable for stock-footage search.
Original user query: ` + query + `
English base translation: ` + englishBase

		secondReply, retryErr := g.generateText(newRequest(
			"You are a strict JSON emitter. Output one valid JSON object only. Do not write any other text.",
			strictPrompt,
			0.1,
		))
		if retryErr != nil {
			return fallback, nil
		}
		if jsonText, err = extractJSONObject(secondReply); err != nil {
			return fallback, nil
		}
	}

	var parsed QueryExpansion
	if json.Unmarshal([]byte(jsonText), &parsed) != nil {
		return fallback, nil
	}

	// Seed the duplicate filter with the fallback entries, then append the
	// model's suggestions that are new.
	known := make(map[string]bool, len(fallback))
	for _, entry := range fallback {
		known[strings.ToLower(strings.TrimSpace(entry))] = true
	}
	queries := fallback
	for _, suggestion := range parsed.Querywords {
		candidate := strings.TrimSpace(suggestion)
		if candidate == "" {
			continue
		}
		if folded := strings.ToLower(candidate); !known[folded] {
			known[folded] = true
			queries = append(queries, candidate)
		}
	}
	return queries, nil
}
// TranslateQuery returns an English rendering of query for stock-footage
// search.
//
// Blank queries, queries that already look mostly ASCII, and calls made
// without an API key return the whitespace-trimmed input unchanged. Otherwise
// the model is asked first; if its answer is unusable (error, empty, or equal
// to the input ignoring case) the translate fallback is tried, and if that
// also fails the trimmed input is returned.
func (g *GeminiService) TranslateQuery(query string) string {
	original := strings.TrimSpace(query)
	if original == "" || looksMostlyASCII(query) || g.APIKey == "" {
		return original
	}

	systemPart := map[string]string{
		"text": "You translate media search intents into natural English. Output one plain English search phrase only. No labels, no quotes, no explanations.",
	}
	userPart := map[string]string{
		"text": "Translate this user query into concise English suitable for stock-footage search: " + query,
	}
	request := map[string]any{
		"systemInstruction": map[string]any{
			"parts": []map[string]string{systemPart},
		},
		"contents": []map[string]any{
			{"parts": []map[string]string{userPart}},
		},
		"generationConfig": map[string]any{
			"responseMimeType": "text/plain",
			"temperature":      0.1,
			"maxOutputTokens":  40,
		},
	}

	if reply, err := g.generateText(request); err == nil {
		candidate := sanitizePlainEnglishLine(reply)
		if candidate != "" && !strings.EqualFold(candidate, original) {
			return candidate
		}
	}

	viaGoogle, err := g.translateViaGoogle(query)
	if err != nil || viaGoogle == "" {
		return original
	}
	return viaGoogle
}
// generateText posts body to the Gemini generateContent endpoint and returns
// the text of the first candidate, joining the text of all of its parts.
//
// The API key travels in the x-goog-api-key header rather than the URL query
// string: transport errors returned by the HTTP client embed the request URL,
// so a key in the query would leak into every wrapped error message and log.
//
// It returns an error when the body cannot be encoded, the request fails, the
// service answers with a status of 300 or above (the first 2 KiB of the
// response are included in the message), the response is not valid JSON, or
// no candidate with content parts is present.
func (g *GeminiService) generateText(body map[string]any) (string, error) {
	rawBody, err := json.Marshal(body)
	if err != nil {
		return "", fmt.Errorf("gemini request encode failed: %w", err)
	}

	const endpoint = "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent"
	req, err := http.NewRequest(http.MethodPost, endpoint, bytes.NewReader(rawBody))
	if err != nil {
		return "", fmt.Errorf("gemini request build failed: %w", err)
	}
	req.Header.Set("Content-Type", "application/json")
	req.Header.Set("x-goog-api-key", g.APIKey)

	resp, err := g.Client.Do(req)
	if err != nil {
		return "", fmt.Errorf("gemini request failed: %w", err)
	}
	defer resp.Body.Close()

	if resp.StatusCode >= 300 {
		// Best effort: the snippet only enriches the error message.
		data, _ := io.ReadAll(io.LimitReader(resp.Body, 2048))
		return "", fmt.Errorf("gemini returned status %d: %s", resp.StatusCode, strings.TrimSpace(string(data)))
	}

	var payload struct {
		Candidates []struct {
			Content struct {
				Parts []struct {
					Text string `json:"text"`
				} `json:"parts"`
			} `json:"content"`
		} `json:"candidates"`
	}
	if err := json.NewDecoder(resp.Body).Decode(&payload); err != nil {
		return "", fmt.Errorf("gemini response decode failed: %w", err)
	}
	if len(payload.Candidates) == 0 || len(payload.Candidates[0].Content.Parts) == 0 {
		return "", fmt.Errorf("gemini returned no candidates")
	}

	// A reply may be split across several parts; return them as one string.
	var text strings.Builder
	for _, part := range payload.Candidates[0].Content.Parts {
		text.WriteString(part.Text)
	}
	return text.String(), nil
}
// Recommend asks Gemini to judge candidate thumbnails against query and
// returns the candidates it marks as recommended.
//
// Only the first 10 candidates are considered, and only those whose thumbnail
// could be downloaded are shown to the model. A recommendation is accepted
// only when its index refers to a candidate that was actually shown, and each
// candidate is returned at most once. When no thumbnail can be fetched, or the
// model recommends nothing usable, the first four candidates are returned as a
// fallback.
//
// It returns an error when the API key is missing, the Gemini request fails,
// or the reply cannot be parsed. An empty candidates slice yields an empty,
// non-nil result.
func (g *GeminiService) Recommend(query string, candidates []SearchResult) ([]AIRecommendation, error) {
	if g.APIKey == "" {
		return nil, fmt.Errorf("gemini api key is not configured")
	}
	if len(candidates) == 0 {
		return []AIRecommendation{}, nil
	}

	// toRecommendation copies the display fields of one search result.
	toRecommendation := func(src SearchResult, reason string) AIRecommendation {
		return AIRecommendation{
			Title:        src.Title,
			Link:         src.Link,
			ThumbnailURL: src.ThumbnailURL,
			Source:       src.Source,
			Reason:       reason,
			Recommended:  true,
		}
	}
	// fallback recommends the leading candidates when the model cannot help.
	fallback := func(reason string) []AIRecommendation {
		head := candidates[:min(4, len(candidates))]
		out := make([]AIRecommendation, 0, len(head))
		for _, candidate := range head {
			out = append(out, toRecommendation(candidate, reason))
		}
		return out
	}

	type geminiPart map[string]any
	parts := []geminiPart{
		{
			"text": `Analyze the provided images for the user's search intent. Return JSON only in this shape:
{"recommendations":[{"index":0,"reason":"short reason","recommended":true}]}
Mark only the best matches as recommended=true. Keep reasons concise. Recommend up to 8 items.
Prefer cinematic b-roll, stock footage, editorial footage, clean composition, usable establishing shots, and professional media thumbnails.
Avoid clickbait faces, exaggerated expressions, meme aesthetics, low-information thumbnails, sensational text overlays, or gossip-style imagery.
Favor thumbnails that look directly useful for media editing and footage sourcing.
User query: ` + query,
		},
	}

	maxImages := min(len(candidates), 10)
	// shown records which candidate indices the model actually receives, so
	// indices it invents for unseen candidates can be rejected later.
	shown := make(map[int]bool, maxImages)
	for idx := 0; idx < maxImages; idx++ {
		img, mimeType, err := fetchImageAsInlineData(g.Client, candidates[idx].ThumbnailURL)
		if err != nil {
			// Best effort: a candidate without a thumbnail is simply not judged.
			continue
		}
		parts = append(parts,
			geminiPart{"text": fmt.Sprintf("Candidate %d: title=%s source=%s link=%s", idx, candidates[idx].Title, candidates[idx].Source, candidates[idx].Link)},
			geminiPart{"inlineData": map[string]string{"mimeType": mimeType, "data": img}},
		)
		shown[idx] = true
	}
	if len(shown) == 0 {
		// Nothing to analyze; skip the API call instead of asking the model
		// to rate images it never received.
		return fallback("Fallback result because no candidate thumbnails could be fetched."), nil
	}

	body := map[string]any{
		"contents": []map[string]any{
			{"parts": parts},
		},
		"generationConfig": map[string]any{
			"responseMimeType": "application/json",
		},
	}
	rawText, err := g.generateText(body)
	if err != nil {
		return nil, fmt.Errorf("gemini vision request failed: %w", err)
	}

	jsonText, err := extractJSONObject(rawText)
	if err != nil {
		return nil, fmt.Errorf("gemini vision JSON extraction failed: %w", err)
	}
	var parsed struct {
		Recommendations []struct {
			Index       int    `json:"index"`
			Reason      string `json:"reason"`
			Recommended bool   `json:"recommended"`
		} `json:"recommendations"`
	}
	if err := json.Unmarshal([]byte(jsonText), &parsed); err != nil {
		return nil, fmt.Errorf("gemini vision JSON parse failed: %w; raw=%q", err, truncateForError(rawText, 200))
	}

	recommendations := make([]AIRecommendation, 0, len(parsed.Recommendations))
	picked := make(map[int]bool, len(parsed.Recommendations))
	for _, rec := range parsed.Recommendations {
		// shown is false for negative, out-of-range, and never-sent indices,
		// so this check also guards the slice access below.
		if !rec.Recommended || !shown[rec.Index] || picked[rec.Index] {
			continue
		}
		picked[rec.Index] = true
		recommendations = append(recommendations, toRecommendation(candidates[rec.Index], rec.Reason))
	}
	if len(recommendations) == 0 {
		return fallback("Fallback result because Gemini returned no recommended items."), nil
	}
	return recommendations, nil
}
// fetchImageAsInlineData downloads imageURL with client and returns the body
// as standard base64 together with its MIME type.
//
// The MIME type comes from the Content-Type header when that names an image
// type; otherwise the bytes are sniffed, and "image/jpeg" is used when
// sniffing does not identify an image either.
//
// It returns an error when the request fails, the status is 300 or above, the
// body is empty, or the body exceeds 2 MiB. Oversized bodies are rejected
// rather than truncated, because a cut-off image is corrupt data.
func fetchImageAsInlineData(client *http.Client, imageURL string) (string, string, error) {
	const maxImageBytes = 2 * 1024 * 1024

	resp, err := client.Get(imageURL)
	if err != nil {
		return "", "", err
	}
	defer resp.Body.Close()
	if resp.StatusCode >= 300 {
		return "", "", fmt.Errorf("thumbnail fetch failed with %d", resp.StatusCode)
	}

	// Read one byte past the cap so an oversized body can be told apart from
	// one that is exactly at the limit.
	data, err := io.ReadAll(io.LimitReader(resp.Body, maxImageBytes+1))
	if err != nil {
		return "", "", err
	}
	if len(data) == 0 {
		return "", "", fmt.Errorf("thumbnail response body is empty")
	}
	if len(data) > maxImageBytes {
		return "", "", fmt.Errorf("thumbnail exceeds %d bytes", maxImageBytes)
	}

	// A parse error leaves mimeType unusable, which the prefix check handles.
	mimeType, _, _ := mime.ParseMediaType(resp.Header.Get("Content-Type"))
	if !strings.HasPrefix(mimeType, "image/") {
		mimeType = http.DetectContentType(data)
		if !strings.HasPrefix(mimeType, "image/") {
			mimeType = "image/jpeg"
		}
	}
	return base64.StdEncoding.EncodeToString(data), mimeType, nil
}
// min returns the smaller of a and b.
func min(a, b int) int {
	if b <= a {
		return b
	}
	return a
}
// extractJSONObject returns the first balanced {...} object found in text.
//
// Surrounding whitespace and a leading/trailing Markdown code fence are
// removed first. Scanning starts at the first '{' and tracks string literals
// and backslash escapes so braces inside strings do not affect nesting. An
// error is returned when there is no '{' or the object never closes.
func extractJSONObject(text string) (string, error) {
	cleaned := strings.TrimSpace(text)
	for _, fence := range []string{"```json", "```"} {
		cleaned = strings.TrimPrefix(cleaned, fence)
	}
	cleaned = strings.TrimSpace(strings.TrimSuffix(cleaned, "```"))

	start := strings.Index(cleaned, "{")
	if start == -1 {
		return "", fmt.Errorf("no JSON object start found in %q", truncateForError(cleaned, 200))
	}

	var (
		depth    int
		inString bool
		escaped  bool
	)
	for pos := start; pos < len(cleaned); pos++ {
		switch ch := cleaned[pos]; {
		case escaped:
			// The character after a backslash is consumed verbatim.
			escaped = false
		case inString && ch == '\\':
			escaped = true
		case ch == '"':
			inString = !inString
		case inString:
			// Ordinary string content never changes nesting depth.
		case ch == '{':
			depth++
		case ch == '}':
			depth--
			if depth == 0 {
				return cleaned[start : pos+1], nil
			}
		}
	}
	return "", fmt.Errorf("no complete JSON object found in %q", truncateForError(cleaned, 200))
}
// truncateForError trims surrounding whitespace from text and, when the result
// is longer than limit bytes, shortens it to at most limit bytes followed by
// "...".
//
// The cut is moved back to the nearest rune boundary so multi-byte UTF-8
// characters are never split. A limit of zero or less yields just "..." for
// any text longer than the limit.
func truncateForError(text string, limit int) string {
	trimmed := strings.TrimSpace(text)
	if len(trimmed) <= limit {
		return trimmed
	}
	// Ranging over a string yields the byte offset of each rune start; keep
	// the last offset that does not exceed the limit.
	cut := 0
	for offset := range trimmed {
		if offset > limit {
			break
		}
		cut = offset
	}
	return trimmed[:cut] + "..."
}
// fallbackQueryExpansion builds a deterministic list of stock-footage search
// queries without calling any remote service.
//
// The base phrase is the trimmed englishQuery, or the trimmed originalQuery
// when englishQuery is blank. The result contains the base phrase followed by
// the base combined with a fixed set of footage-related suffixes, with blank
// entries and case-insensitive duplicates removed.
func fallbackQueryExpansion(originalQuery, englishQuery string) []string {
	root := strings.TrimSpace(englishQuery)
	if root == "" {
		root = strings.TrimSpace(originalQuery)
	}

	// The empty suffix stands for the bare base phrase itself.
	suffixes := [...]string{
		"",
		" b-roll",
		" stock footage",
		" cinematic footage",
		" establishing shot",
		" editorial footage",
		" urban scene",
		" ambient footage",
		" 4k footage",
		" cinematic b-roll",
	}

	result := make([]string, 0, len(suffixes))
	emitted := make(map[string]bool, len(suffixes))
	for _, suffix := range suffixes {
		phrase := strings.TrimSpace(root + suffix)
		if phrase == "" {
			continue
		}
		if folded := strings.ToLower(phrase); !emitted[folded] {
			emitted[folded] = true
			result = append(result, phrase)
		}
	}
	return result
}
// sanitizePlainEnglishLine extracts the first usable line from a model reply
// that was supposed to be a single plain phrase.
//
// For each line it strips surrounding whitespace and quote characters, removes
// a leading "translation:", "english:", or "translated query:" label
// (case-insensitive), and strips whitespace and quotes again so a quoted value
// after a label comes out clean. Lines that are blank or start with
// "here is" / "the translation" are skipped. It returns "" when no line
// qualifies.
func sanitizePlainEnglishLine(text string) string {
	// unwrap trims whitespace first so quotes next to padding are removed too.
	unwrap := func(s string) string {
		return strings.TrimSpace(strings.Trim(strings.TrimSpace(s), "\"'`"))
	}

	for _, line := range strings.Split(text, "\n") {
		line = unwrap(line)
		if line == "" {
			continue
		}
		lower := strings.ToLower(line)
		for _, prefix := range []string{"translation:", "english:", "translated query:"} {
			if strings.HasPrefix(lower, prefix) {
				// The value after a label may carry its own quotes.
				line = unwrap(line[len(prefix):])
				lower = strings.ToLower(line)
			}
		}
		if strings.HasPrefix(lower, "here is") || strings.HasPrefix(lower, "the translation") {
			continue
		}
		if line != "" {
			return line
		}
	}
	return ""
}
// looksMostlyASCII reports whether at least 80% of the runes in text are
// ASCII (code point 127 or below). Empty text counts as ASCII.
//
// The comparison is done on scaled integers (ascii*10 >= total*8) so that the
// threshold is not rounded down for short inputs; with a truncating division
// a single non-ASCII character would be classified as mostly ASCII.
func looksMostlyASCII(text string) bool {
	total, ascii := 0, 0
	for _, r := range text {
		total++
		if r <= 127 {
			ascii++
		}
	}
	return ascii*10 >= total*8
}
// translateViaGoogle translates query to English through the
// translate.googleapis.com "translate_a/single" endpoint using the service's
// HTTP client.
//
// The reply is decoded as a JSON array whose first element is a list of
// segments; the first item of every segment that is a string is concatenated
// to form the translation. An error is returned when the request fails, the
// status is 300 or above, the payload does not have that shape, or the
// resulting translation is blank.
func (g *GeminiService) translateViaGoogle(query string) (string, error) {
	const base = "https://translate.googleapis.com/translate_a/single?client=gtx&sl=auto&tl=en&dt=t&q="

	resp, err := g.Client.Get(base + neturl.QueryEscape(query))
	if err != nil {
		return "", err
	}
	defer resp.Body.Close()
	if resp.StatusCode >= 300 {
		return "", fmt.Errorf("google translate fallback returned status %d", resp.StatusCode)
	}

	var decoded []any
	if err := json.NewDecoder(resp.Body).Decode(&decoded); err != nil {
		return "", err
	}
	if len(decoded) == 0 {
		return "", fmt.Errorf("google translate fallback returned no payload")
	}
	segments, isList := decoded[0].([]any)
	if !isList {
		return "", fmt.Errorf("google translate fallback returned unexpected payload")
	}

	var joined strings.Builder
	for _, raw := range segments {
		fields, isList := raw.([]any)
		if !isList || len(fields) == 0 {
			continue
		}
		// Only the leading item of a segment is used, and only if it is text.
		if piece, isText := fields[0].(string); isText {
			joined.WriteString(piece)
		}
	}

	if result := strings.TrimSpace(joined.String()); result != "" {
		return result, nil
	}
	return "", fmt.Errorf("google translate fallback returned empty translation")
}