Files
ai-media-hub/backend/services/gemini.go
T
AI Assistant cd9b47b33e
build-push / docker (push) Successful in 4m5s
Expand backend debug logging
2026-03-16 14:53:05 +09:00

657 lines
20 KiB
Go

package services
import (
"bytes"
"encoding/base64"
"encoding/json"
"fmt"
"io"
"mime"
"net/http"
neturl "net/url"
"os"
"os/exec"
"path/filepath"
"sort"
"strings"
"time"
)
// GeminiService bundles the credentials, HTTP client, and endpoints used to
// call the Gemini generateContent API and the Google Translate fallback.
type GeminiService struct {
// APIKey is sent as the "key" query parameter on Gemini requests. When it is
// empty, TranslateQuery skips Gemini and Recommend returns an error.
APIKey string
// Client performs the HTTP requests issued by the service's methods.
Client *http.Client
// GenerateEndpoint is the Gemini generateContent URL, without the key.
GenerateEndpoint string
// TranslateEndpoint is the translate fallback URL; translateViaGoogle uses a
// built-in default when it is blank.
TranslateEndpoint string
// Debug is an optional logging hook; a nil hook disables debug output.
Debug func(message string, data any)
}
// AIRecommendation is one evaluated search candidate as returned by Recommend.
// Title, Link, Snippet, ThumbnailURL, PreviewVideoURL, and Source are copied
// from the SearchResult the model's answer refers to.
type AIRecommendation struct {
Title string `json:"title"`
Link string `json:"link"`
Snippet string `json:"snippet"`
ThumbnailURL string `json:"thumbnailUrl"`
PreviewVideoURL string `json:"previewVideoUrl"`
Source string `json:"source"`
// Reason is the model's explanation after normalizeKoreanReason, which
// substitutes a default sentence when the model returned a blank reason.
Reason string `json:"reason"`
// Recommended is true when the model set recommended=true or answered with a
// "Yes" verdict.
Recommended bool `json:"recommended"`
}
// QueryExpansion is a JSON payload holding a list of query words under the
// "querywords" key. It is not referenced in the visible part of this file.
type QueryExpansion struct {
Querywords []string `json:"querywords"`
}
// NewGeminiService returns a service configured with the given API key, an
// HTTP client with a 40 second timeout, and the default Gemini and translate
// endpoints. The Debug hook is left unset.
func NewGeminiService(apiKey string) *GeminiService {
	const (
		defaultGenerateEndpoint  = "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent"
		defaultTranslateEndpoint = "https://translate.googleapis.com/translate_a/single"
	)

	svc := new(GeminiService)
	svc.APIKey = apiKey
	svc.Client = &http.Client{Timeout: 40 * time.Second}
	svc.GenerateEndpoint = defaultGenerateEndpoint
	svc.TranslateEndpoint = defaultTranslateEndpoint
	return svc
}
// ExpandQuery translates query to English and derives the list of search
// phrases to run from it. The returned error is always nil.
func (g *GeminiService) ExpandQuery(query string) ([]string, error) {
	english := g.TranslateQuery(query)
	queries := buildSearchQueries(query, english)

	details := map[string]any{
		"original": query,
		"english":  english,
		"expanded": queries,
	}
	g.debug("gemini:expand_query", details)

	return queries, nil
}
// TranslateQuery turns a user query into an English footage-search phrase.
//
// Queries that are already mostly ASCII (after known Korean media phrases are
// normalized) are returned without any network call. Otherwise the strategies
// are tried in order: Gemini (only when an API key is set), the Google
// Translate endpoint, and the built-in Korean media-term dictionary. If none
// yields an acceptable phrase, the normalized query is returned.
func (g *GeminiService) TranslateQuery(query string) string {
	input := strings.TrimSpace(query)
	if input == "" {
		return ""
	}

	normalized := normalizeKnownMediaPhrases(input)
	switch {
	case looksMostlyASCII(normalized):
		return strings.TrimSpace(normalized)
	case looksMostlyASCII(input):
		return input
	}

	if g.APIKey != "" {
		g.debug("gemini:translate_attempt", map[string]any{"mode": "gemini", "query": input})

		systemInstruction := map[string]any{
			"parts": []map[string]string{
				{
					"text": "You are a professional video editor. Infer stronger stock-footage and scene-search wording from the user's keyword, and expand it into natural English that a professional editor would use to find usable footage. Output one plain English search phrase only. No labels, no quotes, no explanations.",
				},
			},
		}
		userContent := map[string]any{
			"parts": []map[string]string{
				{
					"text": "Expand this user query into a concise but editor-grade English footage search phrase suitable for stock-footage discovery: " + input,
				},
			},
		}
		request := map[string]any{
			"systemInstruction": systemInstruction,
			"contents":          []map[string]any{userContent},
			"generationConfig": map[string]any{
				"responseMimeType": "text/plain",
				"temperature":      0.1,
				"maxOutputTokens":  40,
			},
		}

		answer, err := g.generateText(request)
		if err != nil {
			g.debug("gemini:translate_error", map[string]any{"mode": "gemini", "query": input, "error": err.Error()})
		} else if phrase := sanitizePlainEnglishLine(answer); phrase != "" &&
			!strings.EqualFold(phrase, input) &&
			!isOvercompressedTranslation(input, phrase) {
			g.debug("gemini:translate_success", map[string]any{"mode": "gemini", "query": input, "translated": phrase})
			return phrase
		}
	}

	g.debug("gemini:translate_attempt", map[string]any{"mode": "google", "query": input})
	googlePhrase, googleErr := g.translateViaGoogle(input)
	if googleErr == nil && googlePhrase != "" && isLikelyEnglishQuery(googlePhrase) && !isOvercompressedTranslation(input, googlePhrase) {
		g.debug("gemini:translate_success", map[string]any{"mode": "google", "query": input, "translated": googlePhrase})
		return googlePhrase
	}

	dictionaryPhrase := translateKoreanMediaTerms(normalized)
	if dictionaryPhrase != "" && !strings.EqualFold(dictionaryPhrase, input) {
		g.debug("gemini:translate_success", map[string]any{"mode": "dictionary", "query": input, "translated": dictionaryPhrase})
		return dictionaryPhrase
	}

	g.debug("gemini:translate_fallback_original", map[string]any{"query": input, "normalized": normalized})
	return strings.TrimSpace(normalized)
}
// generateText posts body as JSON to the Gemini generate endpoint and returns
// the text of the first part of the first candidate.
//
// It returns an error when the body cannot be encoded, the request fails, the
// response status is 300 or above (the first 2 KiB of the response body is
// included in the message), the response is not valid JSON, or no candidate
// text is present.
func (g *GeminiService) generateText(body map[string]any) (string, error) {
	rawBody, err := json.Marshal(body)
	if err != nil {
		return "", fmt.Errorf("gemini request encode failed: %w", err)
	}

	endpoint := strings.TrimRight(g.GenerateEndpoint, "?") + "?key=" + neturl.QueryEscape(g.APIKey)
	resp, err := g.Client.Post(endpoint, "application/json", bytes.NewReader(rawBody))
	if err != nil {
		// http.Client reports failures as *url.Error, whose message embeds the
		// full request URL including the API key. Keep only the underlying
		// cause so the key never reaches error strings or debug logs.
		if urlErr, ok := err.(*neturl.Error); ok {
			err = urlErr.Err
		}
		return "", fmt.Errorf("gemini request failed: %w", err)
	}
	defer resp.Body.Close()

	if resp.StatusCode >= 300 {
		// Best effort: the status code is still reported if the read fails.
		data, _ := io.ReadAll(io.LimitReader(resp.Body, 2048))
		return "", fmt.Errorf("gemini returned status %d: %s", resp.StatusCode, strings.TrimSpace(string(data)))
	}

	var payload struct {
		Candidates []struct {
			Content struct {
				Parts []struct {
					Text string `json:"text"`
				} `json:"parts"`
			} `json:"content"`
		} `json:"candidates"`
	}
	if err := json.NewDecoder(resp.Body).Decode(&payload); err != nil {
		return "", fmt.Errorf("gemini response decode failed: %w", err)
	}
	if len(payload.Candidates) == 0 || len(payload.Candidates[0].Content.Parts) == 0 {
		return "", fmt.Errorf("gemini returned no candidates")
	}
	return payload.Candidates[0].Content.Parts[0].Text, nil
}
// Recommend asks Gemini to judge whether each candidate's visual is usable
// footage for query and returns one AIRecommendation per judged candidate.
//
// At most the first 10 candidates are considered. Candidates whose thumbnail
// or preview frame cannot be fetched are skipped and logged through the debug
// hook. Entries in the model's answer that point at a candidate which was not
// sent, or that repeat an index, are ignored.
//
// It returns an error when no API key is configured, when no visual could be
// fetched at all, or when the request or response handling fails. An empty
// candidates slice yields an empty, non-nil result.
func (g *GeminiService) Recommend(query string, candidates []SearchResult) ([]AIRecommendation, error) {
	if g.APIKey == "" {
		return nil, fmt.Errorf("gemini api key is not configured")
	}
	if len(candidates) == 0 {
		return []AIRecommendation{}, nil
	}
	g.debug("gemini:vision_start", map[string]any{
		"query":          query,
		"candidateCount": len(candidates),
	})

	type geminiPart map[string]any
	parts := []geminiPart{
		{
			"text": `You are a professional video editor. Analyze whether each provided visual is suitable as a usable scene or shot for the user's requested keyword. Return JSON only in this shape:
{"recommendations":[{"index":0,"verdict":"Yes","reason":"short reason","recommended":true}]}
Return one entry for every analyzed candidate. Use Korean for every reason. Keep reasons concise but specific enough to explain usefulness.
Set verdict to "Yes" or "No" for every candidate. "Yes" means the scene is usable and relevant for editing against the user's keyword. "No" means it is not suitable or not relevant enough.
Set recommended=true only when verdict is "Yes". Set recommended=false when verdict is "No".
Prefer cinematic b-roll, stock footage, editorial footage, clean composition, usable establishing shots, and professional media thumbnails.
Avoid clickbait faces, exaggerated expressions, meme aesthetics, low-information thumbnails, sensational text overlays, or gossip-style imagery.
Favor scenes that look directly useful for professional editing, sequencing, establishing, cutaway, or mood-building usage.
User query: ` + query,
		},
	}

	maxImages := min(len(candidates), 10)
	// pending holds the indexes whose visuals were actually sent and that have
	// not been answered yet; it is used to validate the model's reply.
	pending := make(map[int]bool, maxImages)
	for idx := 0; idx < maxImages; idx++ {
		img, mimeType, err := fetchCandidateVisualInlineData(g.Client, candidates[idx])
		if err != nil {
			g.debug("gemini:vision_candidate_visual_error", map[string]any{
				"index":  idx,
				"link":   candidates[idx].Link,
				"source": candidates[idx].Source,
				"error":  err.Error(),
			})
			continue
		}
		pending[idx] = true
		parts = append(parts,
			geminiPart{"text": fmt.Sprintf("Candidate %d: title=%s source=%s link=%s", idx, candidates[idx].Title, candidates[idx].Source, candidates[idx].Link)},
			geminiPart{"inlineData": map[string]string{"mimeType": mimeType, "data": img}},
		)
	}
	visualCount := len(pending)
	if visualCount == 0 {
		return nil, fmt.Errorf("no candidate thumbnails or preview frames could be fetched for gemini vision")
	}
	g.debug("gemini:vision_visuals_prepared", map[string]any{
		"query":       query,
		"visualCount": visualCount,
		"maxImages":   maxImages,
	})

	body := map[string]any{
		"contents": []map[string]any{
			{"parts": parts},
		},
		"generationConfig": map[string]any{
			"responseMimeType": "application/json",
		},
	}
	rawBody, err := json.Marshal(body)
	if err != nil {
		return nil, fmt.Errorf("gemini vision request encode failed: %w", err)
	}

	endpoint := strings.TrimRight(g.GenerateEndpoint, "?") + "?key=" + neturl.QueryEscape(g.APIKey)
	resp, err := g.Client.Post(endpoint, "application/json", bytes.NewReader(rawBody))
	if err != nil {
		// http.Client reports failures as *url.Error, whose message embeds the
		// full request URL including the API key. Keep only the underlying
		// cause so the key never reaches callers or logs.
		if urlErr, ok := err.(*neturl.Error); ok {
			err = urlErr.Err
		}
		return nil, fmt.Errorf("gemini vision request failed: %w", err)
	}
	defer resp.Body.Close()

	if resp.StatusCode >= 300 {
		// Best effort: the status code is still reported if the read fails.
		data, _ := io.ReadAll(io.LimitReader(resp.Body, 2048))
		return nil, fmt.Errorf("gemini vision returned status %d: %s", resp.StatusCode, strings.TrimSpace(string(data)))
	}

	var payload struct {
		Candidates []struct {
			Content struct {
				Parts []struct {
					Text string `json:"text"`
				} `json:"parts"`
			} `json:"content"`
		} `json:"candidates"`
	}
	if err := json.NewDecoder(resp.Body).Decode(&payload); err != nil {
		return nil, fmt.Errorf("gemini vision response decode failed: %w", err)
	}
	if len(payload.Candidates) == 0 || len(payload.Candidates[0].Content.Parts) == 0 {
		return nil, fmt.Errorf("gemini vision returned no candidates")
	}

	answer := payload.Candidates[0].Content.Parts[0].Text
	jsonText, err := extractJSONObject(answer)
	if err != nil {
		return nil, fmt.Errorf("gemini vision JSON extraction failed: %w", err)
	}

	var parsed struct {
		Recommendations []struct {
			Index       int    `json:"index"`
			Verdict     string `json:"verdict"`
			Reason      string `json:"reason"`
			Recommended bool   `json:"recommended"`
		} `json:"recommendations"`
	}
	if err := json.Unmarshal([]byte(jsonText), &parsed); err != nil {
		return nil, fmt.Errorf("gemini vision JSON parse failed: %w; raw=%q", err, truncateForError(answer, 200))
	}

	recommendations := make([]AIRecommendation, 0, len(parsed.Recommendations))
	for _, rec := range parsed.Recommendations {
		// A map lookup also rejects negative and out-of-range indexes.
		if !pending[rec.Index] {
			g.debug("gemini:vision_index_ignored", map[string]any{"index": rec.Index})
			continue
		}
		delete(pending, rec.Index) // accept each analyzed candidate only once

		src := candidates[rec.Index]
		recommended := rec.Recommended || strings.EqualFold(strings.TrimSpace(rec.Verdict), "yes")
		recommendations = append(recommendations, AIRecommendation{
			Title:           src.Title,
			Link:            src.Link,
			Snippet:         src.Snippet,
			ThumbnailURL:    src.ThumbnailURL,
			PreviewVideoURL: src.PreviewVideoURL,
			Source:          src.Source,
			Reason:          normalizeKoreanReason(rec.Reason),
			Recommended:     recommended,
		})
	}
	g.debug("gemini:vision_complete", map[string]any{
		"query":               query,
		"recommendationCount": len(recommendations),
	})
	return recommendations, nil
}
// debug forwards message and data to the Debug hook. It is a no-op on a nil
// receiver or when no hook is configured.
func (g *GeminiService) debug(message string, data any) {
	if g == nil {
		return
	}
	if hook := g.Debug; hook != nil {
		hook(message, data)
	}
}
// fetchImageAsInlineData downloads imageURL with browser-like headers and
// returns the image as base64 text together with its MIME type.
//
// referer, when non-blank, is sent as the Referer header. The MIME type comes
// from the Content-Type response header and falls back to "image/jpeg" when
// the header is missing, unparsable, or not an image type.
//
// It returns an error for a blank URL, a failed request, a status of 300 or
// above, an empty body, or a body larger than 2 MiB. Oversized bodies are
// rejected rather than truncated because a cut-off image is not decodable.
func fetchImageAsInlineData(client *http.Client, imageURL, referer string) (string, string, error) {
	const maxImageBytes = 2 * 1024 * 1024

	if strings.TrimSpace(imageURL) == "" {
		return "", "", fmt.Errorf("image url is empty")
	}
	req, err := newBrowserStyleImageRequest(imageURL, referer)
	if err != nil {
		return "", "", err
	}
	resp, err := client.Do(req)
	if err != nil {
		return "", "", err
	}
	defer resp.Body.Close()

	if resp.StatusCode >= 300 {
		return "", "", fmt.Errorf("thumbnail fetch failed with %d", resp.StatusCode)
	}

	// A parse failure leaves mimeType empty, which selects the fallback below.
	mimeType, _, _ := mime.ParseMediaType(resp.Header.Get("Content-Type"))
	if !strings.HasPrefix(mimeType, "image/") {
		mimeType = "image/jpeg"
	}

	// Read one byte past the cap so an oversized body can be told apart from
	// one that is exactly at the limit.
	data, err := io.ReadAll(io.LimitReader(resp.Body, maxImageBytes+1))
	if err != nil {
		return "", "", err
	}
	if len(data) == 0 {
		return "", "", fmt.Errorf("thumbnail response body is empty")
	}
	if len(data) > maxImageBytes {
		return "", "", fmt.Errorf("thumbnail exceeds %d bytes", maxImageBytes)
	}
	return base64.StdEncoding.EncodeToString(data), mimeType, nil
}
// newBrowserStyleImageRequest builds a GET request for imageURL carrying
// User-Agent, Accept, and Accept-Language headers that mimic a desktop Chrome
// browser. A Referer header is added only when referer is not blank.
func newBrowserStyleImageRequest(imageURL, referer string) (*http.Request, error) {
	request, err := http.NewRequest(http.MethodGet, imageURL, nil)
	if err != nil {
		return nil, err
	}

	browserHeaders := [][2]string{
		{"User-Agent", "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36"},
		{"Accept", "image/avif,image/webp,image/apng,image/svg+xml,image/*,*/*;q=0.8"},
		{"Accept-Language", "en-US,en;q=0.9"},
	}
	for _, header := range browserHeaders {
		request.Header.Set(header[0], header[1])
	}

	if strings.TrimSpace(referer) == "" {
		return request, nil
	}
	request.Header.Set("Referer", referer)
	return request, nil
}
// fetchCandidateVisualInlineData returns a base64 image and MIME type that
// represent candidate visually.
//
// For Envato and Artgrid candidates with a preview video, a frame extracted
// from the video is tried first. Otherwise (or if that fails) the thumbnail is
// fetched, using the candidate link as referer. A preview frame is the last
// resort for other sources. Each source is attempted at most once, and the
// returned error lists every failure. When the candidate has neither a
// thumbnail nor a preview video, a dedicated error is returned.
func fetchCandidateVisualInlineData(client *http.Client, candidate SearchResult) (string, string, error) {
	var failures []string
	frameTried := false

	// tryFrame runs the frame extraction once and records why it failed.
	tryFrame := func() (string, string, bool) {
		frameTried = true
		data, mimeType, err := extractFrameFromVideo(candidate.PreviewVideoURL)
		if err != nil {
			failures = append(failures, "preview frame: "+err.Error())
			return "", "", false
		}
		return data, mimeType, true
	}

	hasVideo := candidate.PreviewVideoURL != ""
	if hasVideo && (candidate.Source == "Envato" || candidate.Source == "Artgrid") {
		if data, mimeType, ok := tryFrame(); ok {
			return data, mimeType, nil
		}
	}

	if candidate.ThumbnailURL != "" {
		data, mimeType, err := fetchImageAsInlineData(client, candidate.ThumbnailURL, candidate.Link)
		if err == nil {
			return data, mimeType, nil
		}
		failures = append(failures, "thumbnail: "+err.Error())
	}

	// Skip the video if it already failed above; repeating the same
	// extraction only costs time.
	if hasVideo && !frameTried {
		if data, mimeType, ok := tryFrame(); ok {
			return data, mimeType, nil
		}
	}

	if len(failures) == 0 {
		return "", "", fmt.Errorf("candidate has no thumbnail or preview video")
	}
	return "", "", fmt.Errorf("candidate visual fetch failed: %s", strings.Join(failures, "; "))
}
// extractFrameFromVideo runs ffmpeg to grab a single JPEG frame 0.5 seconds
// into the video at videoURL and returns it as base64 text with the MIME type
// "image/jpeg".
//
// Only absolute http and https URLs are accepted: the value is handed to
// ffmpeg as its input, where other forms (file:, concat:, a leading "-")
// would reach local files or be parsed as options. The ffmpeg process is
// killed if it has not finished within the extraction timeout.
//
// It returns an error for a rejected URL, when ffmpeg cannot be started,
// fails, or times out, or when the extracted frame cannot be read.
func extractFrameFromVideo(videoURL string) (string, string, error) {
	const extractionTimeout = 20 * time.Second

	input := strings.TrimSpace(videoURL)
	parsed, err := neturl.Parse(input)
	if err != nil {
		return "", "", fmt.Errorf("invalid preview video url: %w", err)
	}
	if (parsed.Scheme != "http" && parsed.Scheme != "https") || parsed.Host == "" {
		return "", "", fmt.Errorf("unsupported preview video url %q", videoURL)
	}

	tempDir, err := os.MkdirTemp("", "gemini-frame-*")
	if err != nil {
		return "", "", err
	}
	defer os.RemoveAll(tempDir)

	framePath := filepath.Join(tempDir, "frame.jpg")
	cmd := exec.Command("ffmpeg", "-y", "-ss", "00:00:00.500", "-i", input, "-frames:v", "1", "-q:v", "2", framePath)

	// Capture stdout and stderr together, as CombinedOutput would.
	var output bytes.Buffer
	cmd.Stdout = &output
	cmd.Stderr = &output
	if err := cmd.Start(); err != nil {
		return "", "", fmt.Errorf("ffmpeg frame extraction failed: %w", err)
	}

	// A stalled remote stream would otherwise block this call indefinitely.
	watchdog := time.AfterFunc(extractionTimeout, func() { _ = cmd.Process.Kill() })
	waitErr := cmd.Wait()
	timedOut := !watchdog.Stop() // Stop reports false once the timer has fired
	if waitErr != nil {
		if timedOut {
			return "", "", fmt.Errorf("ffmpeg frame extraction timed out after %s", extractionTimeout)
		}
		return "", "", fmt.Errorf("ffmpeg frame extraction failed: %w: %s", waitErr, strings.TrimSpace(output.String()))
	}

	data, err := os.ReadFile(framePath)
	if err != nil {
		return "", "", err
	}
	return base64.StdEncoding.EncodeToString(data), "image/jpeg", nil
}
// min returns the smaller of a and b.
func min(a, b int) int {
	if b < a {
		return b
	}
	return a
}
// extractJSONObject returns the first balanced {...} object found in text.
//
// Surrounding whitespace and Markdown code fences ("```json" or "```") are
// removed first. Braces inside JSON string literals are ignored, and a
// backslash inside a string escapes the following byte. An error is returned
// when no opening brace exists or the object is never closed.
func extractJSONObject(text string) (string, error) {
	body := strings.TrimSpace(text)
	for _, fence := range []string{"```json", "```"} {
		body = strings.TrimPrefix(body, fence)
	}
	body = strings.TrimSpace(strings.TrimSuffix(body, "```"))

	open := strings.IndexByte(body, '{')
	if open < 0 {
		return "", fmt.Errorf("no JSON object start found in %q", truncateForError(body, 200))
	}

	var (
		nesting  int
		quoted   bool
		skipNext bool
	)
	for pos := open; pos < len(body); pos++ {
		c := body[pos]
		switch {
		case skipNext:
			// Byte following a backslash inside a string: take it literally.
			skipNext = false
		case quoted && c == '\\':
			skipNext = true
		case c == '"':
			quoted = !quoted
		case quoted:
			// Ordinary string content; braces here do not count.
		case c == '{':
			nesting++
		case c == '}':
			nesting--
			if nesting == 0 {
				return body[open : pos+1], nil
			}
		}
	}
	return "", fmt.Errorf("no complete JSON object found in %q", truncateForError(body, 200))
}
// truncateForError trims text and shortens it to at most limit bytes for use
// in error messages, appending "..." when anything was cut.
//
// The cut is moved back to the nearest rune boundary so multi-byte characters
// (for example Korean text) are never split into invalid UTF-8. A negative
// limit is treated like zero.
func truncateForError(text string, limit int) string {
	trimmed := strings.TrimSpace(text)
	if len(trimmed) <= limit {
		return trimmed
	}

	// Ranging over a string yields the byte offset of each rune start; keep
	// the largest one that does not exceed limit.
	cut := 0
	for offset := range trimmed {
		if offset > limit {
			break
		}
		cut = offset
	}
	return trimmed[:cut] + "..."
}
// normalizeKoreanReason trims reason and, when nothing remains, substitutes a
// default Korean sentence so every recommendation carries an explanation.
func normalizeKoreanReason(reason string) string {
	const fallback = "시각 정보가 제한적이지만 검색 의도와의 관련성을 기준으로 평가했습니다."

	if cleaned := strings.TrimSpace(reason); cleaned != "" {
		return cleaned
	}
	return fallback
}
// buildSearchQueries derives the list of search phrases to run for a query.
//
// The base phrase is englishQuery, or originalQuery when englishQuery is
// blank. The result contains the base phrase, a variant with the abbreviation
// "pov" spelled out, and the base phrase with common footage suffixes.
// Whitespace is collapsed, blank entries are dropped, and duplicates are
// removed case-insensitively while preserving order.
func buildSearchQueries(originalQuery, englishQuery string) []string {
	base := strings.TrimSpace(englishQuery)
	if base == "" {
		base = strings.TrimSpace(originalQuery)
	}

	candidates := []string{
		base,
		expandPOVAbbreviation(base),
		base + " stock footage",
		base + " b-roll",
		base + " cinematic footage",
		base + " editorial footage",
		base + " establishing shot",
	}

	seen := make(map[string]struct{}, len(candidates))
	queries := make([]string, 0, len(candidates))
	for _, item := range candidates {
		normalized := strings.Join(strings.Fields(item), " ")
		if normalized == "" {
			continue
		}
		key := strings.ToLower(normalized)
		if _, dup := seen[key]; dup {
			continue
		}
		seen[key] = struct{}{}
		queries = append(queries, normalized)
	}
	return queries
}

// expandPOVAbbreviation replaces each standalone "pov" (any letter case) in
// text with "point of view". Occurrences that are part of a longer ASCII word
// or number, such as "poverty", are left untouched.
func expandPOVAbbreviation(text string) string {
	const (
		abbreviation = "pov"
		expansion    = "point of view"
	)
	isWordByte := func(c byte) bool {
		return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9')
	}

	var out strings.Builder
	copied := 0 // bytes of text already written to out
	for i := 0; i+len(abbreviation) <= len(text); i++ {
		end := i + len(abbreviation)
		if !strings.EqualFold(text[i:end], abbreviation) {
			continue
		}
		if i > 0 && isWordByte(text[i-1]) {
			continue
		}
		if end < len(text) && isWordByte(text[end]) {
			continue
		}
		out.WriteString(text[copied:i])
		out.WriteString(expansion)
		copied = end
		i = end - 1 // resume scanning right after the match
	}
	if copied == 0 {
		return text
	}
	out.WriteString(text[copied:])
	return out.String()
}
// sanitizePlainEnglishLine extracts the first usable line from a model answer
// that is expected to be a single plain search phrase.
//
// For each line, surrounding whitespace and quote characters are removed, a
// leading "translation:", "english:", or "translated query:" label is
// stripped (any letter case), and lines that merely introduce the answer
// ("here is ...", "the translation ...") are skipped. It returns "" when no
// line qualifies.
func sanitizePlainEnglishLine(text string) string {
	// Whitespace is part of the cutset so quotes are removed even when they are
	// followed by a carriage return or padding.
	const trimSet = "\"'` \t\r\n"
	clean := func(s string) string {
		return strings.TrimSpace(strings.Trim(strings.TrimSpace(s), trimSet))
	}

	for _, raw := range strings.Split(text, "\n") {
		line := clean(raw)
		if line == "" {
			continue
		}
		for _, label := range []string{"translation:", "english:", "translated query:"} {
			// Compare against the line itself so the slice offset is always valid.
			if len(line) >= len(label) && strings.EqualFold(line[:len(label)], label) {
				line = clean(line[len(label):])
			}
		}
		lower := strings.ToLower(line)
		if strings.HasPrefix(lower, "here is") || strings.HasPrefix(lower, "the translation") {
			continue
		}
		if line != "" {
			return line
		}
	}
	return ""
}
// looksMostlyASCII reports whether at least 80% of the runes in text are
// ASCII. An empty string counts as ASCII.
//
// The comparison is done by cross-multiplying so integer division cannot
// round the threshold down; a single non-ASCII rune is therefore not ASCII.
func looksMostlyASCII(text string) bool {
	total, ascii := 0, 0
	for _, r := range text {
		total++
		if r <= 127 {
			ascii++
		}
	}
	return ascii*10 >= total*8
}
// isLikelyEnglishQuery reports whether text consists solely of ASCII
// characters and contains at least one ASCII letter.
func isLikelyEnglishQuery(text string) bool {
	sawLetter := false
	for _, r := range text {
		if r > 127 {
			return false
		}
		if ('A' <= r && r <= 'Z') || ('a' <= r && r <= 'z') {
			sawLetter = true
		}
	}
	return sawLetter
}
// translateKoreanMediaTerms substitutes known Korean media and scenery terms
// in query with their English equivalents.
//
// Terms are applied one after another, longest (by rune count) first, so a
// multi-word phrase wins over the shorter terms it contains. Text that matches
// no term is left as is, and whitespace in the result is collapsed.
func translateKoreanMediaTerms(query string) string {
	type term struct {
		ko string
		en string
	}
	dictionary := []term{
		{"사이버 펑크 도시", "cyberpunk city"},
		{"사이버펑크 도시", "cyberpunk city"},
		{"사이버 펑크", "cyberpunk"},
		{"사이버펑크", "cyberpunk"},
		{"네온 도시", "neon city"},
		{"미래 도시", "futuristic city"},
		{"숲속", "forest"},
		{"다정한", "affectionate"},
		{"항공샷", "aerial shot"},
		{"사람들", "people"},
		{"행복한", "happy"},
		{"커플", "couple"},
		{"연인", "lovers"},
		{"도시", "city"},
		{"야경", "night city"},
		{"거리", "street"},
		{"골목", "alley"},
		{"바다", "ocean"},
		{"해변", "beach"},
		{"노을", "sunset"},
		{"자연", "nature"},
		{"드론", "drone"},
		{"인파", "crowd"},
		{"공원", "park"},
		{"숲", "forest"},
		{"비", "rain"},
		{"눈", "snow"},
		{"산", "mountain"},
	}

	// Stable sort keeps the listed order among terms of equal length.
	sort.SliceStable(dictionary, func(a, b int) bool {
		return len([]rune(dictionary[a].ko)) > len([]rune(dictionary[b].ko))
	})

	result := strings.TrimSpace(query)
	for i := range dictionary {
		result = strings.ReplaceAll(result, dictionary[i].ko, dictionary[i].en)
	}
	// Joining the fields already leaves no leading or trailing whitespace.
	return strings.Join(strings.Fields(result), " ")
}
// normalizeKnownMediaPhrases rewrites the Korean spellings of "cyberpunk" and
// "cyberpunk city" to English and collapses whitespace. All other text is
// left unchanged.
func normalizeKnownMediaPhrases(query string) string {
	// Longer phrases come first so they are replaced before their prefixes.
	phrases := [...][2]string{
		{"사이버 펑크 도시", "cyberpunk city"},
		{"사이버펑크 도시", "cyberpunk city"},
		{"사이버 펑크", "cyberpunk"},
		{"사이버펑크", "cyberpunk"},
	}

	result := strings.TrimSpace(query)
	for _, phrase := range phrases {
		result = strings.ReplaceAll(result, phrase[0], phrase[1])
	}
	return strings.Join(strings.Fields(result), " ")
}
// isOvercompressedTranslation reports whether a multi-word original was
// reduced to fewer than two words by translation. The single words
// "cyberpunk", "nightlife", and "cityscape" are accepted as legitimate
// one-word results.
func isOvercompressedTranslation(original, translated string) bool {
	sourceWords := len(strings.Fields(original))
	resultWords := len(strings.Fields(translated))
	if resultWords >= 2 || sourceWords < 2 {
		return false
	}

	switch strings.ToLower(strings.TrimSpace(translated)) {
	case "cyberpunk", "nightlife", "cityscape":
		return false
	}
	return true
}
// translateViaGoogle translates query to English through the translate
// endpoint (auto-detected source language) and returns the concatenated
// translated segments.
//
// It returns an error when the request fails, the status is 300 or above,
// the response is not the expected nested JSON array, or the resulting
// translation is empty.
func (g *GeminiService) translateViaGoogle(query string) (string, error) {
	const defaultEndpoint = "https://translate.googleapis.com/translate_a/single"

	base := g.TranslateEndpoint
	if strings.TrimSpace(base) == "" {
		base = defaultEndpoint
	}
	requestURL := base + "?client=gtx&sl=auto&tl=en&dt=t&q=" + neturl.QueryEscape(query)

	resp, err := g.Client.Get(requestURL)
	if err != nil {
		return "", err
	}
	defer resp.Body.Close()

	if resp.StatusCode >= 300 {
		return "", fmt.Errorf("google translate fallback returned status %d", resp.StatusCode)
	}

	var decoded []any
	if err := json.NewDecoder(resp.Body).Decode(&decoded); err != nil {
		return "", err
	}
	if len(decoded) == 0 {
		return "", fmt.Errorf("google translate fallback returned no payload")
	}

	// The first element is a list of segments; each segment is itself a list
	// whose first element is the translated text.
	segments, isList := decoded[0].([]any)
	if !isList {
		return "", fmt.Errorf("google translate fallback returned unexpected payload")
	}

	var text strings.Builder
	for _, raw := range segments {
		fields, isList := raw.([]any)
		if !isList || len(fields) == 0 {
			continue
		}
		piece, isText := fields[0].(string)
		if !isText {
			continue
		}
		text.WriteString(piece)
	}

	if result := strings.TrimSpace(text.String()); result != "" {
		return result, nil
	}
	return "", fmt.Errorf("google translate fallback returned empty translation")
}