562 lines
15 KiB
Go
562 lines
15 KiB
Go
package services
|
|
|
|
import (
|
|
"bytes"
|
|
"encoding/base64"
|
|
"encoding/json"
|
|
"fmt"
|
|
"io"
|
|
"mime"
|
|
"net/http"
|
|
neturl "net/url"
|
|
"strings"
|
|
"time"
|
|
)
|
|
|
|
// GeminiService bundles the credentials and HTTP client that the methods in
// this file use to talk to the Gemini generateContent endpoint.
type GeminiService struct {
	// APIKey authenticates requests against the Gemini API. Several methods
	// treat an empty key as "not configured".
	APIKey string
	// Client performs every outbound HTTP request issued by the service.
	Client *http.Client
}
|
|
|
|
// AIRecommendation is one search candidate returned by Recommend, carrying
// the candidate's own metadata plus the reason it was selected.
type AIRecommendation struct {
	// Title, Link, ThumbnailURL and Source are copied from the candidate.
	Title        string `json:"title"`
	Link         string `json:"link"`
	ThumbnailURL string `json:"thumbnailUrl"`
	Source       string `json:"source"`
	// Reason is the short justification attached to the recommendation.
	Reason string `json:"reason"`
	// Recommended reports whether the item was selected.
	Recommended bool `json:"recommended"`
}
|
|
|
|
// QueryExpansion mirrors the JSON object ExpandQuery asks the model to emit:
// {"querywords":["..."]}.
type QueryExpansion struct {
	// Querywords holds the search phrases produced by the model.
	Querywords []string `json:"querywords"`
}
|
|
|
|
func NewGeminiService(apiKey string) *GeminiService {
|
|
return &GeminiService{
|
|
APIKey: apiKey,
|
|
Client: &http.Client{Timeout: 40 * time.Second},
|
|
}
|
|
}
|
|
|
|
func (g *GeminiService) ExpandQuery(query string) ([]string, error) {
|
|
if g.APIKey == "" {
|
|
return fallbackQueryExpansion(query, query), nil
|
|
}
|
|
|
|
englishBase := g.TranslateQuery(query)
|
|
|
|
body := map[string]any{
|
|
"systemInstruction": map[string]any{
|
|
"parts": []map[string]string{
|
|
{
|
|
"text": "You are a JSON-only API. Output valid JSON only. Never add prose, labels, markdown, or explanations before or after the JSON.",
|
|
},
|
|
},
|
|
},
|
|
"contents": []map[string]any{
|
|
{
|
|
"parts": []map[string]string{
|
|
{
|
|
"text": `Return JSON only in this shape: {"querywords":["..."]}.
|
|
Generate at most 10 concise English search variations for media discovery across Google Video, Envato, and Artgrid.
|
|
The queries must be usable directly in English search engines for stock footage discovery.
|
|
Prioritize media, video footage, stock footage, cinematic b-roll, editorial footage, and scene-based search terms.
|
|
Avoid celebrity gossip, reaction-style phrasing, clickbait phrasing, and generic web search wording.
|
|
Do not output Korean unless it is part of a proper noun.
|
|
Original user query: ` + query + `
|
|
English base translation: ` + englishBase,
|
|
},
|
|
},
|
|
},
|
|
},
|
|
"generationConfig": map[string]any{
|
|
"responseMimeType": "application/json",
|
|
"temperature": 0.2,
|
|
"maxOutputTokens": 220,
|
|
"responseSchema": map[string]any{
|
|
"type": "OBJECT",
|
|
"properties": map[string]any{
|
|
"querywords": map[string]any{
|
|
"type": "ARRAY",
|
|
"items": map[string]any{
|
|
"type": "STRING",
|
|
},
|
|
},
|
|
},
|
|
"required": []string{"querywords"},
|
|
},
|
|
},
|
|
}
|
|
|
|
rawText, err := g.generateText(body)
|
|
if err != nil {
|
|
return fallbackQueryExpansion(query, englishBase), nil
|
|
}
|
|
|
|
jsonText, err := extractJSONObject(rawText)
|
|
if err != nil {
|
|
strictBody := map[string]any{
|
|
"systemInstruction": map[string]any{
|
|
"parts": []map[string]string{
|
|
{
|
|
"text": "You are a strict JSON emitter. Output one valid JSON object only. Do not write any other text.",
|
|
},
|
|
},
|
|
},
|
|
"contents": []map[string]any{
|
|
{
|
|
"parts": []map[string]string{
|
|
{
|
|
"text": `STRICT JSON ONLY.
|
|
Output must start with { and end with }.
|
|
Do not add prose, explanations, markdown, code fences, or labels.
|
|
Return exactly this shape: {"querywords":["..."]}.
|
|
Generate up to 10 search queries for media discovery across Google Video, Envato, and Artgrid.
|
|
Every query must be in natural English and suitable for stock-footage search.
|
|
Original user query: ` + query + `
|
|
English base translation: ` + englishBase,
|
|
},
|
|
},
|
|
},
|
|
},
|
|
"generationConfig": map[string]any{
|
|
"responseMimeType": "application/json",
|
|
"temperature": 0.1,
|
|
"maxOutputTokens": 220,
|
|
"responseSchema": map[string]any{
|
|
"type": "OBJECT",
|
|
"properties": map[string]any{
|
|
"querywords": map[string]any{
|
|
"type": "ARRAY",
|
|
"items": map[string]any{
|
|
"type": "STRING",
|
|
},
|
|
},
|
|
},
|
|
"required": []string{"querywords"},
|
|
},
|
|
},
|
|
}
|
|
rawText, retryErr := g.generateText(strictBody)
|
|
if retryErr != nil {
|
|
return fallbackQueryExpansion(query, englishBase), nil
|
|
}
|
|
jsonText, err = extractJSONObject(rawText)
|
|
if err != nil {
|
|
return fallbackQueryExpansion(query, englishBase), nil
|
|
}
|
|
}
|
|
|
|
var parsed QueryExpansion
|
|
if err := json.Unmarshal([]byte(jsonText), &parsed); err != nil {
|
|
return fallbackQueryExpansion(query, englishBase), nil
|
|
}
|
|
|
|
queries := fallbackQueryExpansion(query, englishBase)
|
|
seen := map[string]bool{}
|
|
for _, existing := range queries {
|
|
seen[strings.ToLower(strings.TrimSpace(existing))] = true
|
|
}
|
|
for _, item := range parsed.Querywords {
|
|
trimmed := strings.TrimSpace(item)
|
|
if trimmed == "" {
|
|
continue
|
|
}
|
|
key := strings.ToLower(trimmed)
|
|
if seen[key] {
|
|
continue
|
|
}
|
|
seen[key] = true
|
|
queries = append(queries, trimmed)
|
|
}
|
|
return queries, nil
|
|
}
|
|
|
|
func (g *GeminiService) TranslateQuery(query string) string {
|
|
if strings.TrimSpace(query) == "" || looksMostlyASCII(query) || g.APIKey == "" {
|
|
return strings.TrimSpace(query)
|
|
}
|
|
|
|
body := map[string]any{
|
|
"systemInstruction": map[string]any{
|
|
"parts": []map[string]string{
|
|
{
|
|
"text": "You translate media search intents into natural English. Output one plain English search phrase only. No labels, no quotes, no explanations.",
|
|
},
|
|
},
|
|
},
|
|
"contents": []map[string]any{
|
|
{
|
|
"parts": []map[string]string{
|
|
{
|
|
"text": "Translate this user query into concise English suitable for stock-footage search: " + query,
|
|
},
|
|
},
|
|
},
|
|
},
|
|
"generationConfig": map[string]any{
|
|
"responseMimeType": "text/plain",
|
|
"temperature": 0.1,
|
|
"maxOutputTokens": 40,
|
|
},
|
|
}
|
|
|
|
rawText, err := g.generateText(body)
|
|
if err == nil {
|
|
translated := sanitizePlainEnglishLine(rawText)
|
|
if translated != "" && !strings.EqualFold(translated, strings.TrimSpace(query)) {
|
|
return translated
|
|
}
|
|
}
|
|
|
|
if translated, err := g.translateViaGoogle(query); err == nil && translated != "" {
|
|
return translated
|
|
}
|
|
return strings.TrimSpace(query)
|
|
}
|
|
|
|
func (g *GeminiService) generateText(body map[string]any) (string, error) {
|
|
rawBody, _ := json.Marshal(body)
|
|
endpoint := "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent?key=" + g.APIKey
|
|
resp, err := g.Client.Post(endpoint, "application/json", bytes.NewReader(rawBody))
|
|
if err != nil {
|
|
return "", fmt.Errorf("gemini request failed: %w", err)
|
|
}
|
|
defer resp.Body.Close()
|
|
|
|
if resp.StatusCode >= 300 {
|
|
data, _ := io.ReadAll(io.LimitReader(resp.Body, 2048))
|
|
return "", fmt.Errorf("gemini returned status %d: %s", resp.StatusCode, strings.TrimSpace(string(data)))
|
|
}
|
|
|
|
var payload struct {
|
|
Candidates []struct {
|
|
Content struct {
|
|
Parts []struct {
|
|
Text string `json:"text"`
|
|
} `json:"parts"`
|
|
} `json:"content"`
|
|
} `json:"candidates"`
|
|
}
|
|
if err := json.NewDecoder(resp.Body).Decode(&payload); err != nil {
|
|
return "", fmt.Errorf("gemini response decode failed: %w", err)
|
|
}
|
|
if len(payload.Candidates) == 0 || len(payload.Candidates[0].Content.Parts) == 0 {
|
|
return "", fmt.Errorf("gemini returned no candidates")
|
|
}
|
|
return payload.Candidates[0].Content.Parts[0].Text, nil
|
|
}
|
|
|
|
func (g *GeminiService) Recommend(query string, candidates []SearchResult) ([]AIRecommendation, error) {
|
|
if g.APIKey == "" {
|
|
return nil, fmt.Errorf("gemini api key is not configured")
|
|
}
|
|
if len(candidates) == 0 {
|
|
return []AIRecommendation{}, nil
|
|
}
|
|
|
|
type geminiPart map[string]any
|
|
parts := []geminiPart{
|
|
{
|
|
"text": `Analyze the provided images for the user's search intent. Return JSON only in this shape:
|
|
{"recommendations":[{"index":0,"reason":"short reason","recommended":true}]}
|
|
Mark only the best matches as recommended=true. Keep reasons concise. Recommend up to 8 items.
|
|
Prefer cinematic b-roll, stock footage, editorial footage, clean composition, usable establishing shots, and professional media thumbnails.
|
|
Avoid clickbait faces, exaggerated expressions, meme aesthetics, low-information thumbnails, sensational text overlays, or gossip-style imagery.
|
|
Favor thumbnails that look directly useful for media editing and footage sourcing.
|
|
User query: ` + query,
|
|
},
|
|
}
|
|
|
|
maxImages := min(len(candidates), 10)
|
|
for idx := 0; idx < maxImages; idx++ {
|
|
img, mimeType, err := fetchImageAsInlineData(g.Client, candidates[idx].ThumbnailURL)
|
|
if err != nil {
|
|
continue
|
|
}
|
|
parts = append(parts,
|
|
geminiPart{"text": fmt.Sprintf("Candidate %d: title=%s source=%s link=%s", idx, candidates[idx].Title, candidates[idx].Source, candidates[idx].Link)},
|
|
geminiPart{"inlineData": map[string]string{"mimeType": mimeType, "data": img}},
|
|
)
|
|
}
|
|
|
|
body := map[string]any{
|
|
"contents": []map[string]any{
|
|
{"parts": parts},
|
|
},
|
|
"generationConfig": map[string]any{
|
|
"responseMimeType": "application/json",
|
|
},
|
|
}
|
|
|
|
rawBody, _ := json.Marshal(body)
|
|
endpoint := "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent?key=" + g.APIKey
|
|
resp, err := g.Client.Post(endpoint, "application/json", bytes.NewReader(rawBody))
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
defer resp.Body.Close()
|
|
|
|
if resp.StatusCode >= 300 {
|
|
data, _ := io.ReadAll(io.LimitReader(resp.Body, 2048))
|
|
return nil, fmt.Errorf("gemini vision returned status %d: %s", resp.StatusCode, strings.TrimSpace(string(data)))
|
|
}
|
|
|
|
var payload struct {
|
|
Candidates []struct {
|
|
Content struct {
|
|
Parts []struct {
|
|
Text string `json:"text"`
|
|
} `json:"parts"`
|
|
} `json:"content"`
|
|
} `json:"candidates"`
|
|
}
|
|
if err := json.NewDecoder(resp.Body).Decode(&payload); err != nil {
|
|
return nil, fmt.Errorf("gemini vision response decode failed: %w", err)
|
|
}
|
|
if len(payload.Candidates) == 0 || len(payload.Candidates[0].Content.Parts) == 0 {
|
|
return nil, fmt.Errorf("gemini vision returned no candidates")
|
|
}
|
|
|
|
jsonText, err := extractJSONObject(payload.Candidates[0].Content.Parts[0].Text)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("gemini vision JSON extraction failed: %w", err)
|
|
}
|
|
|
|
var parsed struct {
|
|
Recommendations []struct {
|
|
Index int `json:"index"`
|
|
Reason string `json:"reason"`
|
|
Recommended bool `json:"recommended"`
|
|
} `json:"recommendations"`
|
|
}
|
|
if err := json.Unmarshal([]byte(jsonText), &parsed); err != nil {
|
|
return nil, fmt.Errorf("gemini vision JSON parse failed: %w; raw=%q", err, truncateForError(payload.Candidates[0].Content.Parts[0].Text, 200))
|
|
}
|
|
|
|
recommendations := make([]AIRecommendation, 0, len(parsed.Recommendations))
|
|
for _, rec := range parsed.Recommendations {
|
|
if rec.Index < 0 || rec.Index >= len(candidates) || !rec.Recommended {
|
|
continue
|
|
}
|
|
src := candidates[rec.Index]
|
|
recommendations = append(recommendations, AIRecommendation{
|
|
Title: src.Title,
|
|
Link: src.Link,
|
|
ThumbnailURL: src.ThumbnailURL,
|
|
Source: src.Source,
|
|
Reason: rec.Reason,
|
|
Recommended: true,
|
|
})
|
|
}
|
|
|
|
if len(recommendations) == 0 {
|
|
for _, candidate := range candidates[:min(4, len(candidates))] {
|
|
recommendations = append(recommendations, AIRecommendation{
|
|
Title: candidate.Title,
|
|
Link: candidate.Link,
|
|
ThumbnailURL: candidate.ThumbnailURL,
|
|
Source: candidate.Source,
|
|
Reason: "Fallback result because Gemini returned no recommended items.",
|
|
Recommended: true,
|
|
})
|
|
}
|
|
}
|
|
|
|
return recommendations, nil
|
|
}
|
|
|
|
// fetchImageAsInlineData downloads imageURL with client and returns the image
// as standard base64 text together with its MIME type.
//
// The MIME type comes from the Content-Type header when that names an image
// type. Otherwise the downloaded bytes are sniffed with
// http.DetectContentType, and the response is rejected if they do not look
// like an image either; relabelling an HTML error page as image/jpeg would
// hand the caller a payload that is not an image at all.
//
// An error is returned when the request fails, the status code is 300 or
// above, the body is empty, the body is larger than 2 MiB (a silently
// truncated image would be corrupt), or the content is not an image.
func fetchImageAsInlineData(client *http.Client, imageURL string) (string, string, error) {
	const maxThumbnailBytes = 2 * 1024 * 1024

	resp, err := client.Get(imageURL)
	if err != nil {
		return "", "", err
	}
	defer resp.Body.Close()

	if resp.StatusCode >= 300 {
		return "", "", fmt.Errorf("thumbnail fetch failed with %d", resp.StatusCode)
	}

	// Read one byte past the limit so an oversized body can be told apart
	// from one that is exactly at the limit.
	data, err := io.ReadAll(io.LimitReader(resp.Body, maxThumbnailBytes+1))
	if err != nil {
		return "", "", err
	}
	switch {
	case len(data) == 0:
		return "", "", fmt.Errorf("thumbnail response body is empty")
	case len(data) > maxThumbnailBytes:
		return "", "", fmt.Errorf("thumbnail exceeds %d bytes", maxThumbnailBytes)
	}

	// A malformed header yields an empty media type and falls through to
	// content sniffing, so the parse error itself carries no extra information.
	mimeType, _, _ := mime.ParseMediaType(resp.Header.Get("Content-Type"))
	if !strings.HasPrefix(mimeType, "image/") {
		mimeType = http.DetectContentType(data)
		if !strings.HasPrefix(mimeType, "image/") {
			return "", "", fmt.Errorf("thumbnail is not an image (detected %s)", mimeType)
		}
	}

	return base64.StdEncoding.EncodeToString(data), mimeType, nil
}
|
|
|
|
// min returns the smaller of a and b.
func min(a, b int) int {
	smallest := a
	if b < smallest {
		smallest = b
	}
	return smallest
}
|
|
|
|
func extractJSONObject(text string) (string, error) {
|
|
cleaned := strings.TrimSpace(text)
|
|
cleaned = strings.TrimPrefix(cleaned, "```json")
|
|
cleaned = strings.TrimPrefix(cleaned, "```")
|
|
cleaned = strings.TrimSuffix(cleaned, "```")
|
|
cleaned = strings.TrimSpace(cleaned)
|
|
|
|
start := strings.Index(cleaned, "{")
|
|
if start == -1 {
|
|
return "", fmt.Errorf("no JSON object start found in %q", truncateForError(cleaned, 200))
|
|
}
|
|
|
|
depth := 0
|
|
inString := false
|
|
escaped := false
|
|
for i := start; i < len(cleaned); i++ {
|
|
ch := cleaned[i]
|
|
if escaped {
|
|
escaped = false
|
|
continue
|
|
}
|
|
if ch == '\\' && inString {
|
|
escaped = true
|
|
continue
|
|
}
|
|
if ch == '"' {
|
|
inString = !inString
|
|
continue
|
|
}
|
|
if inString {
|
|
continue
|
|
}
|
|
switch ch {
|
|
case '{':
|
|
depth++
|
|
case '}':
|
|
depth--
|
|
if depth == 0 {
|
|
return cleaned[start : i+1], nil
|
|
}
|
|
}
|
|
}
|
|
return "", fmt.Errorf("no complete JSON object found in %q", truncateForError(cleaned, 200))
|
|
}
|
|
|
|
// truncateForError trims surrounding whitespace from text and shortens it to
// at most limit bytes for inclusion in an error message, appending "..." when
// something was cut off.
//
// The cut is moved back to the nearest rune boundary so that multi-byte
// UTF-8 text (for example Korean queries) is never split in the middle of a
// character. A negative limit is treated as 0.
func truncateForError(text string, limit int) string {
	trimmed := strings.TrimSpace(text)
	if limit < 0 {
		limit = 0
	}
	if len(trimmed) <= limit {
		return trimmed
	}

	// Ranging over a string yields the byte offset of every rune start; keep
	// the largest offset that does not exceed limit.
	cut := 0
	for i := range trimmed {
		if i > limit {
			break
		}
		cut = i
	}
	return trimmed[:cut] + "..."
}
|
|
|
|
// fallbackQueryExpansion builds a fixed set of footage-oriented search
// phrases without calling any external service.
//
// The trimmed englishQuery is used as the base phrase, or the trimmed
// originalQuery when englishQuery is blank. The base itself comes first,
// followed by the base combined with each built-in suffix. Blank phrases are
// dropped and duplicates are removed case-insensitively, keeping the first
// occurrence.
func fallbackQueryExpansion(originalQuery, englishQuery string) []string {
	root := strings.TrimSpace(englishQuery)
	if root == "" {
		root = strings.TrimSpace(originalQuery)
	}

	// The empty suffix yields the base phrase on its own.
	suffixes := []string{
		"",
		" b-roll",
		" stock footage",
		" cinematic footage",
		" establishing shot",
		" editorial footage",
		" urban scene",
		" ambient footage",
		" 4k footage",
		" cinematic b-roll",
	}

	phrases := make([]string, 0, len(suffixes))
	emitted := make(map[string]struct{}, len(suffixes))
	for _, suffix := range suffixes {
		phrase := strings.TrimSpace(root + suffix)
		if phrase == "" {
			continue
		}
		key := strings.ToLower(phrase)
		if _, dup := emitted[key]; dup {
			continue
		}
		emitted[key] = struct{}{}
		phrases = append(phrases, phrase)
	}
	return phrases
}
|
|
|
|
// sanitizePlainEnglishLine extracts the first usable line from a model
// answer that is expected to be a single plain phrase.
//
// For every line, surrounding whitespace and wrapping quote characters
// (", ' and `) are removed, then a leading "translation:", "english:" or
// "translated query:" label (case-insensitive) is dropped and the remainder
// is cleaned the same way. Lines that end up empty, or that start with
// "here is" or "the translation", are skipped. The empty string is returned
// when no line qualifies.
func sanitizePlainEnglishLine(text string) string {
	// Whitespace has to be removed before the quotes (otherwise padded or
	// CRLF-terminated lines keep them) and once more afterwards.
	clean := func(s string) string {
		return strings.TrimSpace(strings.Trim(strings.TrimSpace(s), "\"'`"))
	}
	labels := []string{"translation:", "english:", "translated query:"}

	for _, line := range strings.Split(text, "\n") {
		line = clean(line)
		if line == "" {
			continue
		}
		for _, label := range labels {
			// Compare against the line's own bytes so the slice offset is
			// always valid for the line being cut.
			if len(line) >= len(label) && strings.EqualFold(line[:len(label)], label) {
				// Clean again: the value after a label is often quoted.
				line = clean(line[len(label):])
			}
		}
		if line == "" {
			continue
		}
		lower := strings.ToLower(line)
		if strings.HasPrefix(lower, "here is") || strings.HasPrefix(lower, "the translation") {
			continue
		}
		return line
	}
	return ""
}
|
|
|
|
// looksMostlyASCII reports whether the number of ASCII runes in text reaches
// the threshold len*8/10, computed with integer division over the rune count
// (roughly 80%). The empty string counts as mostly ASCII.
func looksMostlyASCII(text string) bool {
	total, plain := 0, 0
	for _, r := range text {
		total++
		if r < 128 {
			plain++
		}
	}
	return plain >= total*8/10
}
|
|
|
|
func (g *GeminiService) translateViaGoogle(query string) (string, error) {
|
|
endpoint := "https://translate.googleapis.com/translate_a/single?client=gtx&sl=auto&tl=en&dt=t&q=" + neturl.QueryEscape(query)
|
|
resp, err := g.Client.Get(endpoint)
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
defer resp.Body.Close()
|
|
|
|
if resp.StatusCode >= 300 {
|
|
return "", fmt.Errorf("google translate fallback returned status %d", resp.StatusCode)
|
|
}
|
|
|
|
var payload []any
|
|
if err := json.NewDecoder(resp.Body).Decode(&payload); err != nil {
|
|
return "", err
|
|
}
|
|
if len(payload) == 0 {
|
|
return "", fmt.Errorf("google translate fallback returned no payload")
|
|
}
|
|
top, ok := payload[0].([]any)
|
|
if !ok {
|
|
return "", fmt.Errorf("google translate fallback returned unexpected payload")
|
|
}
|
|
|
|
var builder strings.Builder
|
|
for _, part := range top {
|
|
segment, ok := part.([]any)
|
|
if !ok || len(segment) == 0 {
|
|
continue
|
|
}
|
|
if text, ok := segment[0].(string); ok {
|
|
builder.WriteString(text)
|
|
}
|
|
}
|
|
translated := strings.TrimSpace(builder.String())
|
|
if translated == "" {
|
|
return "", fmt.Errorf("google translate fallback returned empty translation")
|
|
}
|
|
return translated, nil
|
|
}
|