Files
ai-media-hub/backend/services/gemini.go
T
AI Assistant b43886e950
build-push / docker (push) Successful in 4m52s
Add in-app result viewer and expand Gemini review
2026-03-16 10:12:12 +09:00

626 lines
18 KiB
Go

package services
import (
"bytes"
"encoding/base64"
"encoding/json"
"fmt"
"io"
"mime"
"net/http"
neturl "net/url"
"os"
"os/exec"
"path/filepath"
"sort"
"strings"
"time"
)
// GeminiService holds the configuration used to call the Gemini
// generateContent endpoint and the translation fallback endpoint.
type GeminiService struct {
// APIKey is the Gemini API key. TranslateQuery skips the Gemini call when it
// is empty, and Recommend returns an error.
APIKey string
// Client is the HTTP client used for Gemini, translation and thumbnail requests.
Client *http.Client
// GenerateEndpoint is the Gemini generateContent URL; the API key is appended
// to it as the "key" query parameter.
GenerateEndpoint string
// TranslateEndpoint is the translation fallback URL; translateViaGoogle
// substitutes a built-in default when it is blank.
TranslateEndpoint string
}
// AIRecommendation is one entry returned by Recommend: the fields of a search
// candidate together with the verdict attached to it.
type AIRecommendation struct {
// Title, Link, Snippet, ThumbnailURL, PreviewVideoURL and Source are copied
// from the corresponding SearchResult inside Recommend.
Title string `json:"title"`
Link string `json:"link"`
Snippet string `json:"snippet"`
ThumbnailURL string `json:"thumbnailUrl"`
PreviewVideoURL string `json:"previewVideoUrl"`
Source string `json:"source"`
// Reason is the explanation text: the model's reason passed through
// normalizeKoreanReason, or a fixed fallback message.
Reason string `json:"reason"`
// Recommended carries the model's recommended flag; fallback entries use false.
Recommended bool `json:"recommended"`
}
// QueryExpansion is a JSON-serializable list of query strings stored under the
// "querywords" key.
// NOTE(review): nothing in the visible part of this file references this type;
// confirm whether other files still use it.
type QueryExpansion struct {
Querywords []string `json:"querywords"`
}
// NewGeminiService builds a GeminiService for the given API key, with a
// 40-second HTTP client timeout and the default Gemini and translation
// endpoints.
func NewGeminiService(apiKey string) *GeminiService {
	const (
		defaultGenerateEndpoint  = "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent"
		defaultTranslateEndpoint = "https://translate.googleapis.com/translate_a/single"
	)

	service := new(GeminiService)
	service.APIKey = apiKey
	service.Client = &http.Client{Timeout: 40 * time.Second}
	service.GenerateEndpoint = defaultGenerateEndpoint
	service.TranslateEndpoint = defaultTranslateEndpoint
	return service
}
// ExpandQuery translates the query with TranslateQuery and returns the search
// query variants that buildSearchQueries derives from the original and the
// translated text. The returned error is always nil.
func (g *GeminiService) ExpandQuery(query string) ([]string, error) {
	queries := buildSearchQueries(query, g.TranslateQuery(query))
	return queries, nil
}
// TranslateQuery converts a search query into an English search phrase.
//
// Strategies are tried in this order and the first usable result wins:
//  1. An empty query yields "".
//  2. If the query (after known-phrase normalization, or as typed) is already
//     mostly ASCII it is returned without calling any service.
//  3. Gemini, when an API key is configured.
//  4. The translation fallback endpoint.
//  5. The built-in Korean media term dictionary.
//
// If everything fails, the phrase-normalized query is returned.
func (g *GeminiService) TranslateQuery(query string) string {
	original := strings.TrimSpace(query)
	if original == "" {
		return ""
	}

	normalized := normalizeKnownMediaPhrases(original)
	switch {
	case looksMostlyASCII(normalized):
		return strings.TrimSpace(normalized)
	case looksMostlyASCII(original):
		return original
	}

	if g.APIKey != "" {
		systemPart := map[string]string{
			"text": "You translate media search intents into natural English. Output one plain English search phrase only. No labels, no quotes, no explanations.",
		}
		userPart := map[string]string{
			"text": "Translate this user query into concise English suitable for stock-footage search: " + original,
		}
		request := map[string]any{
			"systemInstruction": map[string]any{
				"parts": []map[string]string{systemPart},
			},
			"contents": []map[string]any{
				{"parts": []map[string]string{userPart}},
			},
			"generationConfig": map[string]any{
				"responseMimeType": "text/plain",
				"temperature":      0.1,
				"maxOutputTokens":  40,
			},
		}
		// A Gemini failure is not fatal: fall through to the next strategy.
		if raw, err := g.generateText(request); err == nil {
			candidate := sanitizePlainEnglishLine(raw)
			usable := candidate != "" &&
				!strings.EqualFold(candidate, original) &&
				!isOvercompressedTranslation(original, candidate)
			if usable {
				return candidate
			}
		}
	}

	viaGoogle, err := g.translateViaGoogle(original)
	if err == nil && viaGoogle != "" && isLikelyEnglishQuery(viaGoogle) && !isOvercompressedTranslation(original, viaGoogle) {
		return viaGoogle
	}

	viaDictionary := translateKoreanMediaTerms(normalized)
	if viaDictionary != "" && !strings.EqualFold(viaDictionary, original) {
		return viaDictionary
	}

	return strings.TrimSpace(normalized)
}
// generateText posts body as JSON to the Gemini generate endpoint and returns
// the text of the first part of the first candidate.
//
// It returns an error when the body cannot be encoded, the request fails, the
// status code is 300 or above (up to 2 KiB of the response body is included
// in the message), the response is not valid JSON, or no candidate text is
// present.
func (g *GeminiService) generateText(body map[string]any) (string, error) {
	rawBody, err := json.Marshal(body)
	if err != nil {
		return "", fmt.Errorf("gemini request encode failed: %w", err)
	}

	// Escape the key so unusual characters cannot corrupt the query string.
	endpoint := strings.TrimRight(g.GenerateEndpoint, "?") + "?key=" + neturl.QueryEscape(g.APIKey)
	resp, err := g.Client.Post(endpoint, "application/json", bytes.NewReader(rawBody))
	if err != nil {
		// http.Client reports failures as *url.Error, whose message embeds the
		// full request URL, API key included. Keep only the underlying cause so
		// the key never reaches logs or API responses.
		if urlErr, ok := err.(*neturl.Error); ok {
			err = urlErr.Err
		}
		return "", fmt.Errorf("gemini request failed: %w", err)
	}
	defer resp.Body.Close()

	if resp.StatusCode >= 300 {
		// Best-effort read of the error body; a read failure only shortens the message.
		data, _ := io.ReadAll(io.LimitReader(resp.Body, 2048))
		return "", fmt.Errorf("gemini returned status %d: %s", resp.StatusCode, strings.TrimSpace(string(data)))
	}

	var payload struct {
		Candidates []struct {
			Content struct {
				Parts []struct {
					Text string `json:"text"`
				} `json:"parts"`
			} `json:"content"`
		} `json:"candidates"`
	}
	if err := json.NewDecoder(resp.Body).Decode(&payload); err != nil {
		return "", fmt.Errorf("gemini response decode failed: %w", err)
	}
	if len(payload.Candidates) == 0 || len(payload.Candidates[0].Content.Parts) == 0 {
		return "", fmt.Errorf("gemini returned no candidates")
	}
	return payload.Candidates[0].Content.Parts[0].Text, nil
}
// Recommend asks Gemini to evaluate the visuals of up to the first ten
// candidates against the user's query and returns one AIRecommendation per
// candidate index the model reported.
//
// Candidates whose thumbnail or preview frame cannot be fetched are left out
// of the request. Out-of-range and repeated indices in the model's answer are
// ignored. When no usable entry remains, up to the first eight candidates are
// returned as non-recommended fallback entries.
//
// It returns an error when no API key is configured, when the request cannot
// be encoded or sent, when Gemini answers with status 300 or above, or when
// the answer cannot be decoded into the expected JSON shape. An empty
// candidate list yields an empty, non-nil slice.
func (g *GeminiService) Recommend(query string, candidates []SearchResult) ([]AIRecommendation, error) {
	if g.APIKey == "" {
		return nil, fmt.Errorf("gemini api key is not configured")
	}
	if len(candidates) == 0 {
		return []AIRecommendation{}, nil
	}

	type geminiPart map[string]any
	parts := []geminiPart{
		{
			"text": `Analyze the provided images for the user's search intent. Return JSON only in this shape:
{"recommendations":[{"index":0,"reason":"short reason","recommended":true}]}
Return one entry for every analyzed candidate. Use Korean for every reason. Keep reasons concise but specific enough to explain usefulness.
Mark the strongest matches as recommended=true and weaker matches as recommended=false.
Prefer cinematic b-roll, stock footage, editorial footage, clean composition, usable establishing shots, and professional media thumbnails.
Avoid clickbait faces, exaggerated expressions, meme aesthetics, low-information thumbnails, sensational text overlays, or gossip-style imagery.
Favor thumbnails that look directly useful for media editing and footage sourcing.
User query: ` + query,
		},
	}

	maxImages := min(len(candidates), 10)
	for idx := 0; idx < maxImages; idx++ {
		img, mimeType, err := fetchCandidateVisualInlineData(g.Client, candidates[idx])
		if err != nil {
			// Best effort: a candidate without a usable visual is simply not analyzed.
			continue
		}
		parts = append(parts,
			geminiPart{"text": fmt.Sprintf("Candidate %d: title=%s source=%s link=%s", idx, candidates[idx].Title, candidates[idx].Source, candidates[idx].Link)},
			geminiPart{"inlineData": map[string]string{"mimeType": mimeType, "data": img}},
		)
	}

	body := map[string]any{
		"contents": []map[string]any{
			{"parts": parts},
		},
		"generationConfig": map[string]any{
			"responseMimeType": "application/json",
		},
	}
	rawBody, err := json.Marshal(body)
	if err != nil {
		return nil, fmt.Errorf("gemini vision request encode failed: %w", err)
	}

	// Escape the key so unusual characters cannot corrupt the query string.
	endpoint := strings.TrimRight(g.GenerateEndpoint, "?") + "?key=" + neturl.QueryEscape(g.APIKey)
	resp, err := g.Client.Post(endpoint, "application/json", bytes.NewReader(rawBody))
	if err != nil {
		// http.Client reports failures as *url.Error, whose message embeds the
		// full request URL, API key included. Return only the underlying cause
		// so the key cannot leak to callers or logs.
		if urlErr, ok := err.(*neturl.Error); ok {
			err = urlErr.Err
		}
		return nil, fmt.Errorf("gemini vision request failed: %w", err)
	}
	defer resp.Body.Close()

	if resp.StatusCode >= 300 {
		// Best-effort read of the error body; a read failure only shortens the message.
		data, _ := io.ReadAll(io.LimitReader(resp.Body, 2048))
		return nil, fmt.Errorf("gemini vision returned status %d: %s", resp.StatusCode, strings.TrimSpace(string(data)))
	}

	var payload struct {
		Candidates []struct {
			Content struct {
				Parts []struct {
					Text string `json:"text"`
				} `json:"parts"`
			} `json:"content"`
		} `json:"candidates"`
	}
	if err := json.NewDecoder(resp.Body).Decode(&payload); err != nil {
		return nil, fmt.Errorf("gemini vision response decode failed: %w", err)
	}
	if len(payload.Candidates) == 0 || len(payload.Candidates[0].Content.Parts) == 0 {
		return nil, fmt.Errorf("gemini vision returned no candidates")
	}

	rawText := payload.Candidates[0].Content.Parts[0].Text
	jsonText, err := extractJSONObject(rawText)
	if err != nil {
		return nil, fmt.Errorf("gemini vision JSON extraction failed: %w", err)
	}

	var parsed struct {
		Recommendations []struct {
			Index       int    `json:"index"`
			Reason      string `json:"reason"`
			Recommended bool   `json:"recommended"`
		} `json:"recommendations"`
	}
	if err := json.Unmarshal([]byte(jsonText), &parsed); err != nil {
		return nil, fmt.Errorf("gemini vision JSON parse failed: %w; raw=%q", err, truncateForError(rawText, 200))
	}

	// toRecommendation copies the candidate's fields and attaches the verdict.
	toRecommendation := func(src SearchResult, reason string, recommended bool) AIRecommendation {
		return AIRecommendation{
			Title:           src.Title,
			Link:            src.Link,
			Snippet:         src.Snippet,
			ThumbnailURL:    src.ThumbnailURL,
			PreviewVideoURL: src.PreviewVideoURL,
			Source:          src.Source,
			Reason:          reason,
			Recommended:     recommended,
		}
	}

	recommendations := make([]AIRecommendation, 0, len(parsed.Recommendations))
	seen := make(map[int]bool, len(parsed.Recommendations))
	for _, rec := range parsed.Recommendations {
		if rec.Index < 0 || rec.Index >= len(candidates) {
			continue
		}
		// The model occasionally repeats an index; keep only the first verdict
		// so the same candidate is never listed twice.
		if seen[rec.Index] {
			continue
		}
		seen[rec.Index] = true
		recommendations = append(recommendations, toRecommendation(candidates[rec.Index], normalizeKoreanReason(rec.Reason), rec.Recommended))
	}

	if len(recommendations) == 0 {
		// Nothing usable came back: surface the leading candidates unranked.
		for _, candidate := range candidates[:min(8, len(candidates))] {
			recommendations = append(recommendations, toRecommendation(candidate, "Gemini Vision 평가를 받지 못해 키워드 기준으로 보강된 결과입니다.", false))
		}
	}
	return recommendations, nil
}
// fetchImageAsInlineData downloads imageURL and returns its content as a
// base64 string together with a MIME type.
//
// A plain GET is tried first. If it fails or answers with status 300 or above,
// the request is retried once with browser-like headers. The MIME type comes
// from the Content-Type header and defaults to "image/jpeg" when the header is
// missing or not an image type.
//
// It returns an error for a blank URL, a failed request, a final status of 300
// or above, an empty body, or a body larger than 2 MiB. Oversized images are
// rejected rather than truncated, because a truncated image is corrupt data.
func fetchImageAsInlineData(client *http.Client, imageURL string) (string, string, error) {
	const maxImageBytes = 2 * 1024 * 1024

	if strings.TrimSpace(imageURL) == "" {
		return "", "", fmt.Errorf("image url is empty")
	}

	resp, err := client.Get(imageURL)
	if err != nil || resp.StatusCode >= 300 {
		// Release the first response before retrying, so exactly one body is
		// open when the deferred Close below is registered.
		if resp != nil {
			resp.Body.Close()
		}
		req, reqErr := newBrowserStyleImageRequest(imageURL)
		if reqErr != nil {
			if err != nil {
				return "", "", err
			}
			return "", "", reqErr
		}
		resp, err = client.Do(req)
		if err != nil {
			return "", "", err
		}
	}
	defer resp.Body.Close()

	if resp.StatusCode >= 300 {
		return "", "", fmt.Errorf("thumbnail fetch failed with %d", resp.StatusCode)
	}

	// A parse failure leaves mimeType empty, which falls back to JPEG below.
	mimeType, _, _ := mime.ParseMediaType(resp.Header.Get("Content-Type"))
	if !strings.HasPrefix(mimeType, "image/") {
		mimeType = "image/jpeg"
	}

	// Read one byte past the limit so an oversized body can be detected.
	data, err := io.ReadAll(io.LimitReader(resp.Body, maxImageBytes+1))
	if err != nil {
		return "", "", err
	}
	if len(data) > maxImageBytes {
		return "", "", fmt.Errorf("thumbnail exceeds %d bytes", maxImageBytes)
	}
	if len(data) == 0 {
		return "", "", fmt.Errorf("thumbnail response body is empty")
	}
	return base64.StdEncoding.EncodeToString(data), mimeType, nil
}
// newBrowserStyleImageRequest builds a GET request for imageURL carrying the
// User-Agent, Accept and Accept-Language headers of a desktop Chrome browser.
// It returns the error from http.NewRequest when the URL cannot be used.
func newBrowserStyleImageRequest(imageURL string) (*http.Request, error) {
	request, err := http.NewRequest(http.MethodGet, imageURL, nil)
	if err != nil {
		return nil, err
	}

	browserHeaders := [][2]string{
		{"User-Agent", "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36"},
		{"Accept", "image/avif,image/webp,image/apng,image/svg+xml,image/*,*/*;q=0.8"},
		{"Accept-Language", "en-US,en;q=0.9"},
	}
	for _, header := range browserHeaders {
		request.Header.Set(header[0], header[1])
	}
	return request, nil
}
// fetchCandidateVisualInlineData returns a base64 image and MIME type for the
// candidate, preferring its thumbnail and falling back to a frame extracted
// from its preview video.
//
// The returned error names every step that failed, so a thumbnail download
// failure is no longer reported as "no thumbnail".
func fetchCandidateVisualInlineData(client *http.Client, candidate SearchResult) (string, string, error) {
	var thumbnailErr error
	if candidate.ThumbnailURL != "" {
		data, mimeType, err := fetchImageAsInlineData(client, candidate.ThumbnailURL)
		if err == nil {
			return data, mimeType, nil
		}
		thumbnailErr = err
	}

	if candidate.PreviewVideoURL != "" {
		data, mimeType, err := extractFrameFromVideo(candidate.PreviewVideoURL)
		if err == nil {
			return data, mimeType, nil
		}
		if thumbnailErr != nil {
			return "", "", fmt.Errorf("thumbnail fetch failed (%v); preview frame extraction failed: %w", thumbnailErr, err)
		}
		return "", "", err
	}

	if thumbnailErr != nil {
		return "", "", fmt.Errorf("thumbnail fetch failed and no preview video is available: %w", thumbnailErr)
	}
	return "", "", fmt.Errorf("candidate has no thumbnail or preview video")
}
// extractFrameFromVideo runs ffmpeg to grab one JPEG frame at 0.5 seconds
// into the video at videoURL and returns it as a base64 string with the MIME
// type "image/jpeg".
//
// videoURL comes from external search results, so only absolute http and
// https URLs are accepted. This keeps option-like values ("-version") and
// ffmpeg's other input protocols (file:, concat:, ...) away from the command
// line. The ffmpeg process is killed after 30 seconds so a stalled download
// cannot block the caller indefinitely.
//
// It returns an error when the URL is rejected, the temporary directory
// cannot be created, ffmpeg cannot be started, fails or times out, or the
// frame file cannot be read.
func extractFrameFromVideo(videoURL string) (string, string, error) {
	const extractionTimeout = 30 * time.Second

	source := strings.TrimSpace(videoURL)
	parsed, err := neturl.Parse(source)
	if err != nil {
		return "", "", fmt.Errorf("invalid preview video url: %w", err)
	}
	if (parsed.Scheme != "http" && parsed.Scheme != "https") || parsed.Host == "" {
		return "", "", fmt.Errorf("unsupported preview video url %q: only absolute http(s) urls are allowed", truncatedForLog(source))
	}

	tempDir, err := os.MkdirTemp("", "gemini-frame-*")
	if err != nil {
		return "", "", err
	}
	defer os.RemoveAll(tempDir)

	framePath := filepath.Join(tempDir, "frame.jpg")
	cmd := exec.Command("ffmpeg", "-y", "-ss", "00:00:00.500", "-i", source, "-frames:v", "1", "-q:v", "2", framePath)

	// Capture stdout and stderr together, as CombinedOutput would.
	var output bytes.Buffer
	cmd.Stdout = &output
	cmd.Stderr = &output
	if err := cmd.Start(); err != nil {
		return "", "", fmt.Errorf("ffmpeg frame extraction failed to start: %w", err)
	}

	timer := time.AfterFunc(extractionTimeout, func() {
		// Killing an already finished process only returns an error, which is safe to ignore.
		_ = cmd.Process.Kill()
	})
	waitErr := cmd.Wait()
	// Stop reports false when the timer already fired, i.e. the process was killed.
	timedOut := !timer.Stop()

	if waitErr != nil {
		if timedOut {
			return "", "", fmt.Errorf("ffmpeg frame extraction timed out after %s", extractionTimeout)
		}
		return "", "", fmt.Errorf("ffmpeg frame extraction failed: %w: %s", waitErr, strings.TrimSpace(output.String()))
	}

	data, err := os.ReadFile(framePath)
	if err != nil {
		return "", "", err
	}
	return base64.StdEncoding.EncodeToString(data), "image/jpeg", nil
}

// truncatedForLog shortens very long values before they are embedded in an
// error message.
func truncatedForLog(value string) string {
	const maxLen = 120
	if len(value) <= maxLen {
		return value
	}
	return value[:maxLen] + "..."
}
// min returns the smaller of a and b.
func min(a, b int) int {
	if b < a {
		return b
	}
	return a
}
// extractJSONObject returns the first balanced {...} object found in text.
//
// Surrounding whitespace and a leading/trailing Markdown code fence are
// removed first. Braces inside double-quoted strings are ignored and
// backslash escapes inside strings are honored. An error is returned when the
// text contains no '{' or the object is never closed.
func extractJSONObject(text string) (string, error) {
	body := strings.TrimSpace(text)
	for _, fence := range []string{"```json", "```"} {
		body = strings.TrimPrefix(body, fence)
	}
	body = strings.TrimSpace(strings.TrimSuffix(body, "```"))

	open := strings.IndexByte(body, '{')
	if open < 0 {
		return "", fmt.Errorf("no JSON object start found in %q", truncateForError(body, 200))
	}

	nesting := 0
	quoted, skipNext := false, false
	for pos := open; pos < len(body); pos++ {
		c := body[pos]
		switch {
		case skipNext:
			// The byte after a backslash inside a string is consumed verbatim.
			skipNext = false
		case quoted && c == '\\':
			skipNext = true
		case c == '"':
			quoted = !quoted
		case quoted:
			// Everything else inside a string is plain content.
		case c == '{':
			nesting++
		case c == '}':
			nesting--
			if nesting == 0 {
				return body[open : pos+1], nil
			}
		}
	}
	return "", fmt.Errorf("no complete JSON object found in %q", truncateForError(body, 200))
}
// truncateForError trims surrounding whitespace from text and, when the
// result is longer than limit bytes, cuts it down to at most limit bytes and
// appends "...".
//
// The cut never lands inside a multi-byte UTF-8 character, so truncated
// Korean (or other non-ASCII) text stays valid. A negative limit is treated
// as zero.
func truncateForError(text string, limit int) string {
	trimmed := strings.TrimSpace(text)
	if limit < 0 {
		limit = 0
	}
	if len(trimmed) <= limit {
		return trimmed
	}

	cut := limit
	// UTF-8 continuation bytes look like 10xxxxxx; step back until the cut
	// sits on the first byte of a character.
	for cut > 0 && trimmed[cut]&0xC0 == 0x80 {
		cut--
	}
	return trimmed[:cut] + "..."
}
// normalizeKoreanReason returns reason without surrounding whitespace, or a
// fixed Korean placeholder sentence when nothing is left after trimming.
func normalizeKoreanReason(reason string) string {
	if cleaned := strings.TrimSpace(reason); cleaned != "" {
		return cleaned
	}
	return "시각 정보가 제한적이지만 검색 의도와의 관련성을 기준으로 평가했습니다."
}
// buildSearchQueries expands a query into search variants for footage
// providers.
//
// englishQuery is used as the base phrase, falling back to originalQuery when
// it is blank. The result contains the base phrase, a variant with the
// standalone word "pov" (any letter case) spelled out as "point of view", and
// the base phrase followed by several footage-related suffixes. Whitespace is
// collapsed and case-insensitive duplicates are dropped, keeping first-seen
// order.
//
// When both inputs are blank an empty, non-nil slice is returned, so callers
// never search for a bare suffix such as "stock footage".
func buildSearchQueries(originalQuery, englishQuery string) []string {
	base := strings.Join(strings.Fields(englishQuery), " ")
	if base == "" {
		base = strings.Join(strings.Fields(originalQuery), " ")
	}
	if base == "" {
		return []string{}
	}

	// Expand "pov" only as a whole word; a substring replacement would turn
	// "poverty" into "point of viewerty".
	words := strings.Fields(base)
	for i, word := range words {
		if strings.EqualFold(word, "pov") {
			words[i] = "point of view"
		}
	}

	candidates := []string{
		base,
		strings.Join(words, " "),
		base + " stock footage",
		base + " b-roll",
		base + " cinematic footage",
		base + " editorial footage",
		base + " establishing shot",
	}

	seen := make(map[string]bool, len(candidates))
	queries := make([]string, 0, len(candidates))
	for _, candidate := range candidates {
		key := strings.ToLower(candidate)
		if seen[key] {
			continue
		}
		seen[key] = true
		queries = append(queries, candidate)
	}
	return queries
}
// sanitizePlainEnglishLine picks the first usable line from a model reply.
//
// Each line is stripped of surrounding whitespace and quote characters
// (", ' and `), then of a leading "translation:", "english:" or
// "translated query:" label (any letter case). Lines that are empty
// afterwards, or that start with "here is" or "the translation", are skipped.
// It returns "" when no line qualifies.
func sanitizePlainEnglishLine(text string) string {
	const quoteChars = "\"'`"
	// Trim whitespace on both sides of the quote removal: trimming quotes
	// first would miss quotes hidden behind a space or a trailing "\r".
	clean := func(s string) string {
		return strings.TrimSpace(strings.Trim(strings.TrimSpace(s), quoteChars))
	}
	labels := []string{"translation:", "english:", "translated query:"}

	for _, line := range strings.Split(text, "\n") {
		line = clean(line)
		if line == "" {
			continue
		}
		for _, label := range labels {
			if len(line) >= len(label) && strings.EqualFold(line[:len(label)], label) {
				// Clean again: the value after a label is often quoted itself.
				line = clean(line[len(label):])
			}
		}
		lower := strings.ToLower(line)
		if strings.HasPrefix(lower, "here is") || strings.HasPrefix(lower, "the translation") {
			continue
		}
		if line != "" {
			return line
		}
	}
	return ""
}
func looksMostlyASCII(text string) bool {
ascii := 0
runes := []rune(text)
for _, r := range runes {
if r <= 127 {
ascii++
}
}
return ascii >= len(runes)*8/10
}
// isLikelyEnglishQuery reports whether text contains at least one ASCII
// letter and no non-ASCII characters at all.
func isLikelyEnglishQuery(text string) bool {
	letters := 0
	for _, r := range text {
		if r > 127 {
			// A single non-ASCII character disqualifies the whole text.
			return false
		}
		if ('a' <= r && r <= 'z') || ('A' <= r && r <= 'Z') {
			letters++
		}
	}
	return letters > 0
}
// translateKoreanMediaTerms replaces known Korean media terms in query with
// their English equivalents and collapses whitespace.
//
// Longer terms are replaced before shorter ones so that, for example,
// "사이버펑크 도시" becomes "cyberpunk city" rather than being split up.
// Each replacement is padded with spaces so that terms written without a
// separator ("행복한커플") come out as separate English words
// ("happy couple"). Text that matches no term is left as it is.
//
// NOTE(review): matching is by substring, so single-syllable terms such as
// "비", "눈" and "산" also match inside longer, unrelated words.
func translateKoreanMediaTerms(query string) string {
	replacements := []struct {
		korean  string
		english string
	}{
		{korean: "사이버 펑크 도시", english: "cyberpunk city"},
		{korean: "사이버펑크 도시", english: "cyberpunk city"},
		{korean: "사이버 펑크", english: "cyberpunk"},
		{korean: "사이버펑크", english: "cyberpunk"},
		{korean: "네온 도시", english: "neon city"},
		{korean: "미래 도시", english: "futuristic city"},
		{korean: "숲속", english: "forest"},
		{korean: "다정한", english: "affectionate"},
		{korean: "항공샷", english: "aerial shot"},
		{korean: "사람들", english: "people"},
		{korean: "행복한", english: "happy"},
		{korean: "커플", english: "couple"},
		{korean: "연인", english: "lovers"},
		{korean: "도시", english: "city"},
		{korean: "야경", english: "night city"},
		{korean: "거리", english: "street"},
		{korean: "골목", english: "alley"},
		{korean: "바다", english: "ocean"},
		{korean: "해변", english: "beach"},
		{korean: "노을", english: "sunset"},
		{korean: "자연", english: "nature"},
		{korean: "드론", english: "drone"},
		{korean: "인파", english: "crowd"},
		{korean: "공원", english: "park"},
		{korean: "숲", english: "forest"},
		{korean: "비", english: "rain"},
		{korean: "눈", english: "snow"},
		{korean: "산", english: "mountain"},
	}
	// Longest term first; the stable sort keeps table order among equal lengths.
	sort.SliceStable(replacements, func(i, j int) bool {
		return len([]rune(replacements[i].korean)) > len([]rune(replacements[j].korean))
	})

	translated := strings.TrimSpace(query)
	for _, replacement := range replacements {
		// The padding is collapsed again by the Fields/Join pass below.
		translated = strings.ReplaceAll(translated, replacement.korean, " "+replacement.english+" ")
	}
	return strings.Join(strings.Fields(translated), " ")
}
// normalizeKnownMediaPhrases rewrites the Korean spellings of "cyberpunk" and
// "cyberpunk city" into English and collapses all whitespace runs in the
// result to single spaces.
func normalizeKnownMediaPhrases(query string) string {
	// Longer phrases come first so the "city" variants win over plain "cyberpunk".
	phrases := [][2]string{
		{"사이버 펑크 도시", "cyberpunk city"},
		{"사이버펑크 도시", "cyberpunk city"},
		{"사이버 펑크", "cyberpunk"},
		{"사이버펑크", "cyberpunk"},
	}

	result := strings.TrimSpace(query)
	for _, phrase := range phrases {
		result = strings.ReplaceAll(result, phrase[0], phrase[1])
	}
	return strings.Join(strings.Fields(result), " ")
}
// isOvercompressedTranslation reports whether a query of two or more words
// was translated into fewer than two words. The single words "cyberpunk",
// "nightlife" and "cityscape" (any letter case) are accepted as complete
// translations and never count as over-compressed.
func isOvercompressedTranslation(original, translated string) bool {
	if len(strings.Fields(original)) < 2 {
		return false
	}
	if len(strings.Fields(translated)) >= 2 {
		return false
	}
	switch strings.ToLower(strings.TrimSpace(translated)) {
	case "cyberpunk", "nightlife", "cityscape":
		return false
	}
	return true
}
// translateViaGoogle sends query to the translation endpoint (auto-detected
// source language, English target) and returns the translated text.
//
// The response is expected to be a JSON array whose first element is a list
// of segments; the first element of each segment is used when it is a string,
// and the pieces are concatenated in order. A blank TranslateEndpoint falls
// back to the built-in default URL.
//
// It returns an error when the request fails, the status is 300 or above, the
// payload cannot be decoded or has an unexpected shape, or the resulting
// translation is empty.
func (g *GeminiService) translateViaGoogle(query string) (string, error) {
	base := g.TranslateEndpoint
	if strings.TrimSpace(base) == "" {
		base = "https://translate.googleapis.com/translate_a/single"
	}
	requestURL := base + "?client=gtx&sl=auto&tl=en&dt=t&q=" + neturl.QueryEscape(query)

	resp, err := g.Client.Get(requestURL)
	if err != nil {
		return "", err
	}
	defer resp.Body.Close()

	if resp.StatusCode >= 300 {
		return "", fmt.Errorf("google translate fallback returned status %d", resp.StatusCode)
	}

	var decoded []any
	if err := json.NewDecoder(resp.Body).Decode(&decoded); err != nil {
		return "", err
	}
	if len(decoded) == 0 {
		return "", fmt.Errorf("google translate fallback returned no payload")
	}
	segments, isList := decoded[0].([]any)
	if !isList {
		return "", fmt.Errorf("google translate fallback returned unexpected payload")
	}

	pieces := make([]string, 0, len(segments))
	for _, raw := range segments {
		fields, isList := raw.([]any)
		if !isList || len(fields) == 0 {
			continue
		}
		text, isString := fields[0].(string)
		if !isString {
			continue
		}
		pieces = append(pieces, text)
	}

	result := strings.TrimSpace(strings.Join(pieces, ""))
	if result == "" {
		return "", fmt.Errorf("google translate fallback returned empty translation")
	}
	return result, nil
}