738 lines
20 KiB
Go
738 lines
20 KiB
Go
package services
|
|
|
|
import (
|
|
"encoding/json"
|
|
"fmt"
|
|
"io"
|
|
"net/http"
|
|
"net/url"
|
|
"os/exec"
|
|
"regexp"
|
|
"sort"
|
|
"strings"
|
|
"sync"
|
|
"time"
|
|
)
|
|
|
|
// SearchResult is a single media hit returned to callers, serialised to JSON
// with camelCase keys.
type SearchResult struct {
	// Title is the result headline as reported by the search backend; the
	// enrichment step may replace it with the page's og:title.
	Title string `json:"title"`
	// Link is the URL of the result page.
	Link string `json:"link"`
	// DisplayLink is the host part shown next to the result.
	DisplayLink string `json:"displayLink"`
	// Snippet is the short description text of the result.
	Snippet string `json:"snippet"`
	// ThumbnailURL points at a preview image, when one is known.
	ThumbnailURL string `json:"thumbnailUrl"`
	// PreviewVideoURL points at a playable preview clip, when one is known.
	PreviewVideoURL string `json:"previewVideoUrl"`
	// Source is the platform label, e.g. "Envato", "Artgrid" or "Google Video".
	Source string `json:"source"`
}
|
|
|
|
// SearchService queries a SearXNG-style JSON search endpoint and enriches the
// hits with thumbnails and preview videos.
type SearchService struct {
	// BaseURL is the search endpoint root; "/search" is appended per request.
	BaseURL string
	// GoogleVideoEngine is the engine name sent for video-category searches.
	GoogleVideoEngine string
	// WebEngine is the engine name sent for general web searches.
	WebEngine string
	// Client performs every outbound HTTP request made by the service.
	Client *http.Client
}
|
|
|
|
func NewSearchService(baseURL, googleVideoEngine, webEngine string) *SearchService {
|
|
if googleVideoEngine == "" {
|
|
googleVideoEngine = "google videos"
|
|
}
|
|
if webEngine == "" {
|
|
webEngine = "google"
|
|
}
|
|
return &SearchService{
|
|
BaseURL: strings.TrimRight(baseURL, "/"),
|
|
GoogleVideoEngine: googleVideoEngine,
|
|
WebEngine: webEngine,
|
|
Client: &http.Client{Timeout: 20 * time.Second},
|
|
}
|
|
}
|
|
|
|
func (s *SearchService) SearchMedia(queries []string, enabledPlatforms map[string]bool) ([]SearchResult, error) {
|
|
if s.BaseURL == "" {
|
|
return nil, fmt.Errorf("searxng base url is not configured")
|
|
}
|
|
|
|
type sourceConfig struct {
|
|
name string
|
|
categories string
|
|
engine string
|
|
maxResults int
|
|
build func(string) []string
|
|
accept func(SearchResult) bool
|
|
}
|
|
|
|
sources := []sourceConfig{
|
|
{
|
|
name: "Envato",
|
|
categories: "general",
|
|
engine: s.WebEngine,
|
|
maxResults: 8,
|
|
build: buildEnvatoQueries,
|
|
accept: isRenderableEnvatoResult,
|
|
},
|
|
{
|
|
name: "Artgrid",
|
|
categories: "general",
|
|
engine: s.WebEngine,
|
|
maxResults: 8,
|
|
build: buildArtgridQueries,
|
|
accept: isRenderableArtgridResult,
|
|
},
|
|
{
|
|
name: "Google Video",
|
|
categories: "videos",
|
|
engine: s.GoogleVideoEngine,
|
|
maxResults: 6,
|
|
build: buildGoogleVideoQueries,
|
|
accept: isUsefulGoogleVideoResult,
|
|
},
|
|
}
|
|
|
|
seen := map[string]bool{}
|
|
sourceCounts := map[string]int{}
|
|
results := make([]SearchResult, 0, 90)
|
|
var lastErr error
|
|
|
|
baseQueries := limitQueries(queries, 6)
|
|
primaryQueries := baseQueries[:minInt(len(baseQueries), 3)]
|
|
runSearchPass := func(bases []string, onlyMissing bool) {
|
|
for _, base := range bases {
|
|
base = strings.TrimSpace(base)
|
|
if base == "" {
|
|
continue
|
|
}
|
|
for _, source := range sources {
|
|
if len(enabledPlatforms) > 0 && !enabledPlatforms[strings.ToLower(source.name)] {
|
|
continue
|
|
}
|
|
if sourceCounts[source.name] >= source.maxResults {
|
|
continue
|
|
}
|
|
if onlyMissing && sourceCounts[source.name] > 0 {
|
|
continue
|
|
}
|
|
for _, searchQuery := range source.build(base) {
|
|
if sourceCounts[source.name] >= source.maxResults {
|
|
break
|
|
}
|
|
items, err := s.search(searchQuery, source.categories, source.engine, source.name)
|
|
if err != nil {
|
|
lastErr = err
|
|
items, err = s.search(searchQuery, source.categories, "", source.name)
|
|
}
|
|
if err != nil {
|
|
lastErr = err
|
|
continue
|
|
}
|
|
for _, item := range items {
|
|
if item.Link == "" || seen[item.Link] || !source.accept(item) {
|
|
continue
|
|
}
|
|
seen[item.Link] = true
|
|
results = append(results, item)
|
|
sourceCounts[source.name]++
|
|
if sourceCounts[source.name] >= source.maxResults {
|
|
break
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
runSearchPass(primaryQueries, false)
|
|
if len(baseQueries) > len(primaryQueries) {
|
|
runSearchPass(baseQueries[len(primaryQueries):], true)
|
|
}
|
|
|
|
if len(results) == 0 && lastErr != nil {
|
|
return nil, lastErr
|
|
}
|
|
|
|
sort.SliceStable(results, func(i, j int) bool {
|
|
return sourceWeight(results[i].Source) > sourceWeight(results[j].Source)
|
|
})
|
|
return s.EnrichResults(results), nil
|
|
}
|
|
|
|
func (s *SearchService) EnrichResults(results []SearchResult) []SearchResult {
|
|
limit := minInt(len(results), 18)
|
|
if limit == 0 {
|
|
return results
|
|
}
|
|
|
|
enriched := make([]SearchResult, len(results))
|
|
copy(enriched, results)
|
|
|
|
var wg sync.WaitGroup
|
|
sem := make(chan struct{}, 4)
|
|
for idx := 0; idx < limit; idx++ {
|
|
wg.Add(1)
|
|
go func(i int) {
|
|
defer wg.Done()
|
|
sem <- struct{}{}
|
|
defer func() { <-sem }()
|
|
enriched[i] = s.enrichResult(enriched[i])
|
|
}(idx)
|
|
}
|
|
wg.Wait()
|
|
return enriched
|
|
}
|
|
|
|
func (s *SearchService) enrichResult(result SearchResult) SearchResult {
|
|
switch result.Source {
|
|
case "Envato":
|
|
return s.enrichEnvato(result)
|
|
case "Artgrid":
|
|
return s.enrichArtgrid(result)
|
|
default:
|
|
if result.ThumbnailURL == "" {
|
|
result.ThumbnailURL = deriveThumbnail(result.Link)
|
|
}
|
|
return result
|
|
}
|
|
}
|
|
|
|
func (s *SearchService) enrichEnvato(result SearchResult) SearchResult {
|
|
html, err := s.fetchText(result.Link)
|
|
if err != nil {
|
|
return result
|
|
}
|
|
result.Title = firstNonEmpty(
|
|
extractMetaContent(html, "og:title"),
|
|
result.Title,
|
|
)
|
|
result.Snippet = firstNonEmpty(
|
|
extractMetaContent(html, "og:description"),
|
|
extractMetaContent(html, "description"),
|
|
result.Snippet,
|
|
)
|
|
|
|
pageThumbnail := firstNonEmpty(
|
|
extractMetaContent(html, "og:image"),
|
|
extractMetaContent(html, "twitter:image"),
|
|
extractJSONLDValue(html, "thumbnailUrl"),
|
|
)
|
|
if shouldPreferPageThumbnail(result.ThumbnailURL, result.Link) {
|
|
result.ThumbnailURL = pageThumbnail
|
|
}
|
|
if result.PreviewVideoURL == "" {
|
|
result.PreviewVideoURL = firstNonEmpty(
|
|
extractJSONLDValue(html, "contentUrl"),
|
|
extractMetaContent(html, "twitter:player:stream"),
|
|
extractVideoPreviewURL(html),
|
|
deriveEnvatoPreviewFromThumbnail(pageThumbnail),
|
|
deriveEnvatoPreviewFromThumbnail(result.ThumbnailURL),
|
|
)
|
|
}
|
|
return result
|
|
}
|
|
|
|
func (s *SearchService) enrichArtgrid(result SearchResult) SearchResult {
|
|
clipID := extractArtgridClipID(result.Link)
|
|
if clipID == "" {
|
|
return result
|
|
}
|
|
|
|
apiURL := "https://artgrid.io/api/clip/details?clipId=" + clipID
|
|
body, err := s.fetchJSONText(apiURL)
|
|
if err == nil {
|
|
urls := collectURLs(body)
|
|
if result.ThumbnailURL == "" {
|
|
result.ThumbnailURL = pickImageURL(urls)
|
|
}
|
|
if result.PreviewVideoURL == "" {
|
|
result.PreviewVideoURL = pickVideoURL(urls)
|
|
}
|
|
}
|
|
|
|
if result.ThumbnailURL == "" || result.PreviewVideoURL == "" {
|
|
html, err := s.fetchText(result.Link)
|
|
if err == nil {
|
|
result.Title = firstNonEmpty(
|
|
extractMetaContent(html, "og:title"),
|
|
result.Title,
|
|
)
|
|
result.Snippet = firstNonEmpty(
|
|
extractMetaContent(html, "og:description"),
|
|
extractMetaContent(html, "description"),
|
|
result.Snippet,
|
|
)
|
|
pageThumbnail := firstNonEmpty(
|
|
extractMetaContent(html, "og:image"),
|
|
extractMetaContent(html, "twitter:image"),
|
|
extractArtgridBackgroundThumbnail(html, clipID),
|
|
extractJSONLDValue(html, "image"),
|
|
)
|
|
if shouldPreferPageThumbnail(result.ThumbnailURL, result.Link) {
|
|
result.ThumbnailURL = pageThumbnail
|
|
}
|
|
if result.PreviewVideoURL == "" {
|
|
result.PreviewVideoURL = firstNonEmpty(
|
|
extractJSONLDValue(html, "contentUrl"),
|
|
extractMetaContent(html, "twitter:player:stream"),
|
|
extractVideoPreviewURL(html),
|
|
)
|
|
}
|
|
}
|
|
}
|
|
|
|
return result
|
|
}
|
|
|
|
func (s *SearchService) search(query, categories, engine, source string) ([]SearchResult, error) {
|
|
values := url.Values{}
|
|
values.Set("q", query)
|
|
values.Set("format", "json")
|
|
values.Set("safesearch", "0")
|
|
values.Set("language", "en-US")
|
|
if categories != "" {
|
|
values.Set("categories", categories)
|
|
}
|
|
if engine != "" {
|
|
values.Set("engines", engine)
|
|
}
|
|
|
|
endpoint := s.BaseURL + "/search?" + values.Encode()
|
|
resp, err := s.Client.Get(endpoint)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
defer resp.Body.Close()
|
|
|
|
if resp.StatusCode >= 300 {
|
|
return nil, fmt.Errorf("searxng returned status %d for query %q", resp.StatusCode, query)
|
|
}
|
|
|
|
var payload struct {
|
|
Results []struct {
|
|
Title string `json:"title"`
|
|
URL string `json:"url"`
|
|
Content string `json:"content"`
|
|
Thumbnail string `json:"thumbnail"`
|
|
ThumbnailSrc string `json:"thumbnail_src"`
|
|
ImgSrc string `json:"img_src"`
|
|
ParsedURL []any `json:"parsed_url"`
|
|
Engine string `json:"engine"`
|
|
} `json:"results"`
|
|
}
|
|
if err := json.NewDecoder(resp.Body).Decode(&payload); err != nil {
|
|
return nil, fmt.Errorf("searxng JSON decode failed for query %q: %w", query, err)
|
|
}
|
|
|
|
results := make([]SearchResult, 0, len(payload.Results))
|
|
for _, item := range payload.Results {
|
|
link := strings.TrimSpace(item.URL)
|
|
if link == "" {
|
|
continue
|
|
}
|
|
results = append(results, SearchResult{
|
|
Title: item.Title,
|
|
Link: link,
|
|
DisplayLink: inferDisplayLink(link, item.ParsedURL),
|
|
Snippet: item.Content,
|
|
ThumbnailURL: firstNonEmpty(item.Thumbnail, item.ThumbnailSrc, item.ImgSrc, deriveThumbnail(link)),
|
|
Source: normalizeSource(source, link, item.Engine),
|
|
})
|
|
}
|
|
return results, nil
|
|
}
|
|
|
|
// buildGoogleVideoQueries returns the single video-search query for base: the
// quoted phrase, a group of footage-related terms, and a list of excluded
// words that filter out tutorials, reviews and similar content.
func buildGoogleVideoQueries(base string) []string {
	const template = `"%s" ("stock footage" OR "b-roll" OR cinematic OR "establishing shot" OR editorial) -tutorial -"how to" -review -reaction -course -podcast -vlog -interview -breakdown -edit -editing`
	query := fmt.Sprintf(template, base)
	return []string{query}
}
|
|
|
|
// buildEnvatoQueries returns two queries for base restricted to Envato
// Elements: one for the whole site and one for its stock-video section.
func buildEnvatoQueries(base string) []string {
	const terms = `("stock footage" OR "stock video" OR "b-roll" OR cinematic)`
	sites := [...]string{"elements.envato.com", "elements.envato.com/stock-video"}

	queries := make([]string, 0, len(sites))
	for _, site := range sites {
		queries = append(queries, fmt.Sprintf(`"%s" %s site:%s`, base, terms, site))
	}
	return queries
}
|
|
|
|
// buildArtgridQueries returns two queries for base restricted to Artgrid clip
// pages, each with a different group of footage-related terms.
func buildArtgridQueries(base string) []string {
	termGroups := [...]string{
		`("stock footage" OR "b-roll" OR cinematic OR editorial)`,
		`("footage" OR "cinematic" OR "establishing shot")`,
	}

	queries := make([]string, 0, len(termGroups))
	for _, terms := range termGroups {
		queries = append(queries, fmt.Sprintf(`"%s" %s site:artgrid.io/clip/`, base, terms))
	}
	return queries
}
|
|
|
|
func isUsefulGoogleVideoResult(result SearchResult) bool {
|
|
lowerLink := strings.ToLower(result.Link)
|
|
if !(strings.Contains(lowerLink, "youtube.com/watch") || strings.Contains(lowerLink, "youtu.be/") || strings.Contains(lowerLink, "youtube.com/shorts/")) {
|
|
return false
|
|
}
|
|
text := strings.ToLower(result.Title + " " + result.Snippet)
|
|
for _, banned := range []string{
|
|
"tutorial", "how to", "review", "reaction", "podcast", "interview", "walkthrough",
|
|
"course", "lesson", "edit tutorial", "editing tutorial", "premiere pro", "after effects",
|
|
"breakdown", "explained", "vlog", "tips", "guide", "learn", "free download",
|
|
"bgm", "music", "song", "lyrics", "audio", "soundtrack", "trailer", "teaser",
|
|
"full movie", "movie clip", "status", "whatsapp status", "fan cam", "fancam",
|
|
} {
|
|
if strings.Contains(text, banned) {
|
|
return false
|
|
}
|
|
}
|
|
return true
|
|
}
|
|
|
|
func isRenderableEnvatoResult(result SearchResult) bool {
|
|
parsed, err := url.Parse(result.Link)
|
|
if err != nil {
|
|
return false
|
|
}
|
|
host := strings.ToLower(parsed.Host)
|
|
path := strings.Trim(parsed.Path, "/")
|
|
if strings.Contains(host, "elements.envato.com") {
|
|
if path == "" || strings.Contains(path, "/stock-video") || strings.Contains(path, "/video-templates") {
|
|
return false
|
|
}
|
|
return regexp.MustCompile(`-[A-Z0-9]{6,}$`).MatchString(path)
|
|
}
|
|
return false
|
|
}
|
|
|
|
func isRenderableArtgridResult(result SearchResult) bool {
|
|
parsed, err := url.Parse(result.Link)
|
|
if err != nil {
|
|
return false
|
|
}
|
|
if !strings.Contains(strings.ToLower(parsed.Host), "artgrid.io") {
|
|
return false
|
|
}
|
|
return regexp.MustCompile(`^/clip/[0-9]+/`).MatchString(parsed.Path)
|
|
}
|
|
|
|
// normalizeSource picks the platform label for a hit.
//
// A non-empty source is returned as-is. Otherwise the label is guessed from
// the link ("envato" or "videohive" give "Envato", "artgrid" gives "Artgrid"),
// then from the engine name (anything containing "google" gives
// "Google Video"). If nothing matches, the engine name itself is returned.
func normalizeSource(source, link, engine string) string {
	if source != "" {
		return source
	}

	loweredLink := strings.ToLower(link)
	if strings.Contains(loweredLink, "envato") || strings.Contains(loweredLink, "videohive") {
		return "Envato"
	}
	if strings.Contains(loweredLink, "artgrid") {
		return "Artgrid"
	}
	if strings.Contains(strings.ToLower(engine), "google") {
		return "Google Video"
	}
	return engine
}
|
|
|
|
// inferDisplayLink returns the host to display for a hit.
//
// When parsed has at least two elements and the second one is a string, that
// string is used. Otherwise the host is taken from link; an unparsable link
// yields the empty string.
func inferDisplayLink(link string, parsed []any) string {
	if len(parsed) >= 2 {
		if netloc, isString := parsed[1].(string); isString {
			return netloc
		}
	}

	target, err := url.Parse(link)
	if err != nil {
		return ""
	}
	return target.Host
}
|
|
|
|
func deriveThumbnail(link string) string {
|
|
if videoID := extractYouTubeID(link); videoID != "" {
|
|
return "https://i.ytimg.com/vi/" + videoID + "/hqdefault.jpg"
|
|
}
|
|
return ""
|
|
}
|
|
|
|
// youTubeIDPatterns lists the link shapes an 11-character video ID is read
// from, in order of preference. They are compiled once because the function
// runs for every search hit.
var youTubeIDPatterns = []*regexp.Regexp{
	regexp.MustCompile(`(?:v=|\/shorts\/|\/embed\/)([A-Za-z0-9_-]{11})`),
	regexp.MustCompile(`youtu\.be\/([A-Za-z0-9_-]{11})`),
}

// extractYouTubeID returns the 11-character YouTube video ID contained in
// link, or the empty string when link has none of the recognised shapes
// (watch "v=" parameter, "/shorts/", "/embed/" or a youtu.be short link).
func extractYouTubeID(link string) string {
	for _, pattern := range youTubeIDPatterns {
		if matches := pattern.FindStringSubmatch(link); len(matches) == 2 {
			return matches[1]
		}
	}
	return ""
}
|
|
|
|
func extractMetaContent(html, property string) string {
|
|
patterns := []*regexp.Regexp{
|
|
regexp.MustCompile(`(?i)<meta[^>]+property=["']` + regexp.QuoteMeta(property) + `["'][^>]+content=["']([^"']+)`),
|
|
regexp.MustCompile(`(?i)<meta[^>]+name=["']` + regexp.QuoteMeta(property) + `["'][^>]+content=["']([^"']+)`),
|
|
}
|
|
for _, pattern := range patterns {
|
|
matches := pattern.FindStringSubmatch(html)
|
|
if len(matches) == 2 {
|
|
return htmlUnescape(matches[1])
|
|
}
|
|
}
|
|
return ""
|
|
}
|
|
|
|
// videoPreviewURLPattern matches absolute http(s) URLs ending in "mp4" or
// "m3u8". The slashes after the scheme may be backslash-escaped as in JSON.
// The class stops at quotes, whitespace and '>'; backslashes stay allowed so
// escaped paths ("\/") are captured in full.
var videoPreviewURLPattern = regexp.MustCompile(`https?:\\?/\\?/[^"'\s>]+(?:mp4|m3u8)`)

// extractVideoPreviewURL scans html for mp4 or m3u8 URLs and returns the first
// one whose unescaped form contains "preview", "video" or "watermark"
// (case-insensitive). It returns the empty string when there is none.
func extractVideoPreviewURL(html string) string {
	for _, match := range videoPreviewURLPattern.FindAllString(html, -1) {
		// Undo JSON-style escaping of slashes and drop doubled backslashes.
		candidate := strings.ReplaceAll(match, `\/`, `/`)
		candidate = strings.ReplaceAll(candidate, `\u002F`, `/`)
		candidate = strings.ReplaceAll(candidate, `\\`, "")

		lower := strings.ToLower(candidate)
		if strings.Contains(lower, "preview") || strings.Contains(lower, "video") || strings.Contains(lower, "watermark") {
			return candidate
		}
	}
	return ""
}
|
|
|
|
// artgridThumbnailPattern matches https image URLs (jpeg, jpg, png, webp) that
// mention one of the known Artgrid/Artlist image hosts. The host may directly
// follow the scheme or appear later in the URL. The match stops at quotes,
// whitespace and '>'.
var artgridThumbnailPattern = regexp.MustCompile(`https://[^"'\s>]*(?:artgrid\.imgix\.net|cms-public-artifacts\.artlist\.io|artlist-content-images\.imgix\.net)[^"'\s>]+(?:jpeg|jpg|png|webp)`)

// extractArtgridBackgroundThumbnail returns the first Artgrid image URL in
// html that either contains clipID or contains "graded-thumbnail"
// (case-insensitive). An empty clipID never matches on its own. It returns the
// empty string when no URL qualifies.
func extractArtgridBackgroundThumbnail(html, clipID string) string {
	for _, match := range artgridThumbnailPattern.FindAllString(html, -1) {
		// Guard against an empty ID: strings.Contains(x, "") is always true
		// and would accept any image on the page.
		if clipID != "" && strings.Contains(match, clipID) {
			return match
		}
		if strings.Contains(strings.ToLower(match), "graded-thumbnail") {
			return match
		}
	}
	return ""
}
|
|
|
|
// artgridClipIDPattern captures the numeric ID in ".../clip/<id>/...".
// It is compiled once instead of on every call.
var artgridClipIDPattern = regexp.MustCompile(`/clip/([0-9]+)/`)

// extractArtgridClipID returns the numeric clip ID found in link, or the empty
// string when link has no "/clip/<digits>/" part.
func extractArtgridClipID(link string) string {
	if matches := artgridClipIDPattern.FindStringSubmatch(link); len(matches) == 2 {
		return matches[1]
	}
	return ""
}
|
|
|
|
// absoluteURLPattern matches http(s) URLs up to the next quote, backslash or
// whitespace character. It is compiled once instead of on every call.
var absoluteURLPattern = regexp.MustCompile(`https?:\/\/[^"'\\\s]+`)

// collectURLs returns every distinct absolute http(s) URL found in body, in
// order of first appearance. The result is empty (not nil) when none is found.
func collectURLs(body string) []string {
	matches := absoluteURLPattern.FindAllString(body, -1)
	seen := make(map[string]bool, len(matches))
	results := make([]string, 0, len(matches))
	for _, match := range matches {
		// TrimSpace also removes Unicode spaces that the regexp's \s does not
		// treat as a delimiter.
		candidate := strings.TrimSpace(strings.Trim(match, `"'`))
		if candidate == "" || seen[candidate] {
			continue
		}
		seen[candidate] = true
		results = append(results, candidate)
	}
	return results
}
|
|
|
|
// pickImageURL returns the first URL that contains an image extension
// (.jpg, .jpeg, .png or .webp, case-insensitive), or the empty string.
func pickImageURL(urls []string) string {
	extensions := [...]string{".jpg", ".jpeg", ".png", ".webp"}
	for _, candidate := range urls {
		folded := strings.ToLower(candidate)
		for _, ext := range extensions {
			if strings.Contains(folded, ext) {
				return candidate
			}
		}
	}
	return ""
}
|
|
|
|
// pickVideoURL returns the first URL that contains a video extension
// (.mp4 or .m3u8, case-insensitive), or the empty string.
func pickVideoURL(urls []string) string {
	extensions := [...]string{".mp4", ".m3u8"}
	for _, candidate := range urls {
		folded := strings.ToLower(candidate)
		for _, ext := range extensions {
			if strings.Contains(folded, ext) {
				return candidate
			}
		}
	}
	return ""
}
|
|
|
|
func (s *SearchService) fetchText(target string) (string, error) {
|
|
req, err := newBrowserRequest(http.MethodGet, target, "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
resp, err := s.Client.Do(req)
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
defer resp.Body.Close()
|
|
if resp.StatusCode == http.StatusForbidden || resp.StatusCode == http.StatusServiceUnavailable {
|
|
return fetchTextViaPython(target)
|
|
}
|
|
if resp.StatusCode >= 300 {
|
|
return "", fmt.Errorf("fetch returned status %d", resp.StatusCode)
|
|
}
|
|
data, err := io.ReadAll(io.LimitReader(resp.Body, 1024*1024))
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
if looksLikeCloudflareChallenge(string(data)) {
|
|
return fetchTextViaPython(target)
|
|
}
|
|
return string(data), nil
|
|
}
|
|
|
|
func (s *SearchService) fetchJSONText(target string) (string, error) {
|
|
req, err := newBrowserRequest(http.MethodGet, target, "application/json, text/json, */*")
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
resp, err := s.Client.Do(req)
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
defer resp.Body.Close()
|
|
if resp.StatusCode >= 300 {
|
|
return "", fmt.Errorf("json fetch returned status %d", resp.StatusCode)
|
|
}
|
|
data, err := io.ReadAll(io.LimitReader(resp.Body, 1024*1024))
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
return string(data), nil
|
|
}
|
|
|
|
// firstNonEmpty returns the first argument that is not blank, i.e. contains
// something other than whitespace. The value is returned untrimmed. It returns
// the empty string when every argument is blank.
func firstNonEmpty(values ...string) string {
	for i := 0; i < len(values); i++ {
		if strings.TrimSpace(values[i]) == "" {
			continue
		}
		return values[i]
	}
	return ""
}
|
|
|
|
func shouldPreferPageThumbnail(current, pageLink string) bool {
|
|
current = strings.TrimSpace(current)
|
|
if current == "" {
|
|
return true
|
|
}
|
|
lower := strings.ToLower(current)
|
|
if strings.Contains(lower, "imgs.search.brave.com") || strings.Contains(lower, "googleusercontent.com") || strings.Contains(lower, "bing.com") {
|
|
return true
|
|
}
|
|
currentHost := hostOf(current)
|
|
pageHost := hostOf(pageLink)
|
|
return currentHost == "" || (pageHost != "" && currentHost != pageHost)
|
|
}
|
|
|
|
// hostOf returns the lower-cased host (including any port) of raw, or the
// empty string when raw cannot be parsed as a URL.
func hostOf(raw string) string {
	if parsed, err := url.Parse(raw); err == nil {
		return strings.ToLower(parsed.Host)
	}
	return ""
}
|
|
|
|
func extractJSONLDValue(html, key string) string {
|
|
pattern := regexp.MustCompile(`"` + regexp.QuoteMeta(key) + `"\s*:\s*"(https?:\\?/\\?/[^"]+|[^"]+)"`)
|
|
matches := pattern.FindAllStringSubmatch(html, -1)
|
|
for _, match := range matches {
|
|
if len(match) != 2 {
|
|
continue
|
|
}
|
|
value := strings.ReplaceAll(match[1], `\/`, `/`)
|
|
value = strings.ReplaceAll(value, `\u002F`, `/`)
|
|
value = strings.ReplaceAll(value, `\\`, "")
|
|
value = htmlUnescape(value)
|
|
if strings.TrimSpace(value) != "" {
|
|
return value
|
|
}
|
|
}
|
|
return ""
|
|
}
|
|
|
|
func deriveEnvatoPreviewFromThumbnail(thumbnail string) string {
|
|
candidate := htmlUnescape(strings.TrimSpace(thumbnail))
|
|
if candidate == "" {
|
|
return ""
|
|
}
|
|
candidate = strings.ReplaceAll(candidate, "&", "&")
|
|
if strings.Contains(candidate, "/video_preview/") {
|
|
if idx := strings.Index(candidate, "?"); idx >= 0 {
|
|
candidate = candidate[:idx]
|
|
}
|
|
return regexp.MustCompile(`/video_preview/[^/]+\.(?:jpg|jpeg|png|webp)$`).ReplaceAllString(candidate, `/watermarked_preview/watermarked_preview.mp4`)
|
|
}
|
|
return ""
|
|
}
|
|
|
|
// newBrowserRequest creates a body-less request for target that carries a
// desktop Chrome User-Agent and an English Accept-Language header. The Accept
// header is only set when accept is non-empty.
func newBrowserRequest(method, target, accept string) (*http.Request, error) {
	req, err := http.NewRequest(method, target, nil)
	if err != nil {
		return nil, err
	}

	headers := [][2]string{
		{"User-Agent", "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36"},
		{"Accept-Language", "en-US,en;q=0.9"},
	}
	if accept != "" {
		headers = append(headers, [2]string{"Accept", accept})
	}
	for _, header := range headers {
		req.Header.Set(header[0], header[1])
	}
	return req, nil
}
|
|
|
|
func fetchTextViaPython(target string) (string, error) {
|
|
script := `
|
|
from urllib.request import Request, urlopen
|
|
import sys
|
|
req = Request(sys.argv[1], headers={
|
|
"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36",
|
|
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
|
|
"Accept-Language": "en-US,en;q=0.9",
|
|
})
|
|
with urlopen(req, timeout=20) as resp:
|
|
sys.stdout.buffer.write(resp.read(1024 * 1024))
|
|
`
|
|
output, err := exec.Command("python3", "-c", script, target).CombinedOutput()
|
|
if err != nil {
|
|
return "", fmt.Errorf("python fallback failed: %v: %s", err, truncateBytes(output, 300))
|
|
}
|
|
return string(output), nil
|
|
}
|
|
|
|
// looksLikeCloudflareChallenge reports whether body contains one of the
// phrases "cf-mitigated", "attention required" or "just a moment"
// (case-insensitive).
func looksLikeCloudflareChallenge(body string) bool {
	folded := strings.ToLower(body)
	for _, marker := range [...]string{"cf-mitigated", "attention required", "just a moment"} {
		if strings.Contains(folded, marker) {
			return true
		}
	}
	return false
}
|
|
|
|
// truncateBytes returns data as a whitespace-trimmed string. If the trimmed
// text is longer than limit bytes it is cut and "..." is appended.
//
// The cut is moved back to the nearest UTF-8 character boundary so a
// multi-byte character is never split, and a negative limit is treated as
// zero.
func truncateBytes(data []byte, limit int) string {
	trimmed := strings.TrimSpace(string(data))
	if limit < 0 {
		limit = 0
	}
	if len(trimmed) <= limit {
		return trimmed
	}

	cut := limit
	// Bytes of the form 10xxxxxx continue a multi-byte character; step back
	// until the cut sits in front of a leading byte.
	for cut > 0 && trimmed[cut]&0xC0 == 0x80 {
		cut--
	}
	return trimmed[:cut] + "..."
}
|
|
|
|
// limitQueries returns up to limit distinct, non-blank queries in their
// original order. Queries are trimmed, and duplicates are detected
// case-insensitively (the first spelling is kept). A limit of zero or less
// yields an empty, non-nil slice.
func limitQueries(queries []string, limit int) []string {
	if limit <= 0 {
		return []string{}
	}

	capacity := len(queries)
	if limit < capacity {
		capacity = limit
	}

	seen := make(map[string]bool, capacity)
	filtered := make([]string, 0, capacity)
	for _, item := range queries {
		trimmed := strings.TrimSpace(item)
		if trimmed == "" {
			continue
		}
		key := strings.ToLower(trimmed)
		if seen[key] {
			continue
		}
		seen[key] = true
		filtered = append(filtered, trimmed)
		if len(filtered) >= limit {
			break
		}
	}
	return filtered
}
|
|
|
|
// htmlEntityReplacer decodes the handful of HTML entities that occur in
// attribute values. It is built once; a strings.Replacer may be used from
// several goroutines at the same time.
var htmlEntityReplacer = strings.NewReplacer(
	"&amp;", "&",
	"&quot;", `"`,
	"&#39;", "'",
	"&#x27;", "'",
	"&apos;", "'",
	"&lt;", "<",
	"&gt;", ">",
)

// htmlUnescape replaces the entities for &, ", ', < and > in text with the
// characters they stand for. The text is processed in a single pass, so
// "&amp;lt;" becomes "&lt;" rather than "<".
func htmlUnescape(text string) string {
	return htmlEntityReplacer.Replace(text)
}
|
|
|
|
// sourceWeight returns the sort priority of a platform label: 3 for "Envato",
// 2 for "Artgrid", 1 for "Google Video" and 0 for anything else. Higher values
// sort first.
func sourceWeight(source string) int {
	if source == "Envato" {
		return 3
	}
	if source == "Artgrid" {
		return 2
	}
	if source == "Google Video" {
		return 1
	}
	return 0
}
|
|
|
|
// minInt returns the smaller of a and b.
func minInt(a, b int) int {
	smaller := b
	if a < smaller {
		smaller = a
	}
	return smaller
}
|