This commit is contained in:
@@ -64,6 +64,30 @@
|
||||
- keep the anti-timeout optimization
|
||||
- recover Envato/Artgrid recall when the early pass is too narrow
|
||||
|
||||
## Current Session Update (2026-03-13, HTML Snapshot Analysis)

- Used saved HTML snapshots supplied by the user for:
  - Envato item page
  - Artgrid clip page
- Findings:
  - Envato page exposes clean `VideoObject` JSON-LD with:
    - exact asset title
    - rich description
    - thumbnail URL
    - preview mp4 URL
  - Artgrid page exposes reliable meta fields for:
    - title
    - description
    - thumbnail
    - canonical URL
  - Artgrid snapshot still does **not** expose a stable preview mp4 or m3u8 in the saved HTML or downloaded asset bundle inspected here
- Fixes applied from the snapshots:
  - Envato enrichment now prefers `VideoObject` JSON-LD over generic meta tags
  - Envato search cards should now align much better with the actual source asset and preview
  - Artgrid title/description are now cleaned so Gemini/source text is less polluted by site suffixes and generic boilerplate
- Remaining limitation:
  - Artgrid hover-video preview cannot be derived reliably from the provided snapshot alone
  - If an Artgrid preview video is still required, the next useful artifact is a browser HAR or DevTools network capture from an opened clip page
|
||||
|
||||
## Local Self-Test Workflow

- Primary command:
  - `bash scripts/selftest.sh`
|
||||
|
||||
+98
-3
@@ -197,17 +197,21 @@ func (s *SearchService) enrichEnvato(result SearchResult) SearchResult {
|
||||
if err != nil {
|
||||
return result
|
||||
}
|
||||
videoMeta := extractVideoObjectJSONLD(html)
|
||||
result.Title = firstNonEmpty(
|
||||
cleanEnvatoTitle(videoMeta.Name),
|
||||
extractMetaContent(html, "og:title"),
|
||||
result.Title,
|
||||
)
|
||||
result.Snippet = firstNonEmpty(
|
||||
cleanEnvatoDescription(videoMeta.Description),
|
||||
extractMetaContent(html, "og:description"),
|
||||
extractMetaContent(html, "description"),
|
||||
result.Snippet,
|
||||
)
|
||||
|
||||
pageThumbnail := firstNonEmpty(
|
||||
videoMeta.ThumbnailURL,
|
||||
extractMetaContent(html, "og:image"),
|
||||
extractMetaContent(html, "twitter:image"),
|
||||
extractJSONLDValue(html, "thumbnailUrl"),
|
||||
@@ -217,6 +221,7 @@ func (s *SearchService) enrichEnvato(result SearchResult) SearchResult {
|
||||
}
|
||||
if result.PreviewVideoURL == "" {
|
||||
result.PreviewVideoURL = firstNonEmpty(
|
||||
videoMeta.ContentURL,
|
||||
extractJSONLDValue(html, "contentUrl"),
|
||||
extractMetaContent(html, "twitter:player:stream"),
|
||||
extractVideoPreviewURL(html),
|
||||
@@ -249,12 +254,13 @@ func (s *SearchService) enrichArtgrid(result SearchResult) SearchResult {
|
||||
html, err := s.fetchText(result.Link)
|
||||
if err == nil {
|
||||
result.Title = firstNonEmpty(
|
||||
extractMetaContent(html, "og:title"),
|
||||
cleanArtgridTitle(extractMetaContent(html, "og:title")),
|
||||
cleanArtgridTitle(extractMetaContent(html, "title")),
|
||||
result.Title,
|
||||
)
|
||||
result.Snippet = firstNonEmpty(
|
||||
extractMetaContent(html, "og:description"),
|
||||
extractMetaContent(html, "description"),
|
||||
cleanArtgridDescription(extractMetaContent(html, "og:description")),
|
||||
cleanArtgridDescription(extractMetaContent(html, "description")),
|
||||
result.Snippet,
|
||||
)
|
||||
pageThumbnail := firstNonEmpty(
|
||||
@@ -630,6 +636,95 @@ func extractJSONLDValue(html, key string) string {
|
||||
return ""
|
||||
}
|
||||
|
||||
// videoObjectMetadata carries the four values that extractVideoObjectJSONLD
// reads out of a JSON-LD VideoObject payload. Every field is a plain string
// and is left empty when the payload does not provide the matching key.
type videoObjectMetadata struct {
	// Name and Description are filled from the payload's "name" and
	// "description" keys.
	Name, Description string
	// ThumbnailURL and ContentURL are filled from the payload's
	// "thumbnailUrl" and "contentUrl" keys.
	ThumbnailURL, ContentURL string
}
|
||||
|
||||
func extractVideoObjectJSONLD(html string) videoObjectMetadata {
|
||||
pattern := regexp.MustCompile(`(?is)<script[^>]+type=["']application/ld\+json["'][^>]*>(.*?)</script>`)
|
||||
matches := pattern.FindAllStringSubmatch(html, -1)
|
||||
for _, match := range matches {
|
||||
if len(match) != 2 {
|
||||
continue
|
||||
}
|
||||
var payload map[string]any
|
||||
if err := json.Unmarshal([]byte(htmlUnescape(strings.TrimSpace(match[1]))), &payload); err != nil {
|
||||
continue
|
||||
}
|
||||
typeName, _ := payload["@type"].(string)
|
||||
if !strings.EqualFold(typeName, "VideoObject") {
|
||||
continue
|
||||
}
|
||||
meta := videoObjectMetadata{
|
||||
Name: stringValue(payload["name"]),
|
||||
Description: stringValue(payload["description"]),
|
||||
ThumbnailURL: stringValue(payload["thumbnailUrl"]),
|
||||
ContentURL: stringValue(payload["contentUrl"]),
|
||||
}
|
||||
if meta.Name != "" || meta.Description != "" || meta.ThumbnailURL != "" || meta.ContentURL != "" {
|
||||
return meta
|
||||
}
|
||||
}
|
||||
return videoObjectMetadata{}
|
||||
}
|
||||
|
||||
func stringValue(value any) string {
|
||||
switch typed := value.(type) {
|
||||
case string:
|
||||
return htmlUnescape(strings.TrimSpace(typed))
|
||||
case []any:
|
||||
for _, item := range typed {
|
||||
if text := stringValue(item); text != "" {
|
||||
return text
|
||||
}
|
||||
}
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
func cleanEnvatoTitle(title string) string {
|
||||
title = htmlUnescape(strings.TrimSpace(title))
|
||||
return strings.TrimSuffix(title, " - Envato")
|
||||
}
|
||||
|
||||
func cleanEnvatoDescription(description string) string {
|
||||
description = htmlUnescape(strings.TrimSpace(description))
|
||||
description = strings.ReplaceAll(description, "&", "&")
|
||||
return description
|
||||
}
|
||||
|
||||
func cleanArtgridTitle(title string) string {
|
||||
title = htmlUnescape(strings.TrimSpace(title))
|
||||
replacements := []string{
|
||||
" | Stock Video Footage * Artgrid.io*",
|
||||
" | Stock Video Footage - Artgrid.io",
|
||||
" | Royalty Free Stock Footage – Artgrid.io",
|
||||
" | Royalty Free Stock Footage - Artgrid.io",
|
||||
}
|
||||
for _, suffix := range replacements {
|
||||
title = strings.TrimSuffix(title, suffix)
|
||||
}
|
||||
if idx := strings.Index(title, " by "); idx > 0 {
|
||||
title = title[:idx]
|
||||
}
|
||||
return strings.TrimSpace(title)
|
||||
}
|
||||
|
||||
func cleanArtgridDescription(description string) string {
|
||||
description = htmlUnescape(strings.TrimSpace(description))
|
||||
description = strings.TrimSuffix(description, " Download this royalty free video and other Stunning Stock HD Videos from Artgrid.")
|
||||
description = strings.TrimSuffix(description, " Download this royalty free video and other Stunning Stock HD Videos from Artgrid")
|
||||
if parts := strings.SplitN(description, " | ", 2); len(parts) == 2 {
|
||||
description = parts[1]
|
||||
}
|
||||
description = strings.TrimSuffix(description, " | Royalty Free Stock Footage – Artgrid.io")
|
||||
description = strings.TrimSuffix(description, " | Royalty Free Stock Footage - Artgrid.io")
|
||||
return strings.TrimSpace(description)
|
||||
}
|
||||
|
||||
func deriveEnvatoPreviewFromThumbnail(thumbnail string) string {
|
||||
candidate := htmlUnescape(strings.TrimSpace(thumbnail))
|
||||
if candidate == "" {
|
||||
|
||||
@@ -30,3 +30,22 @@ func TestIsUsefulGoogleVideoResultRejectsMusicResults(t *testing.T) {
|
||||
t.Fatal("expected bgm/music result to be rejected")
|
||||
}
|
||||
}
|
||||
|
||||
func TestExtractVideoObjectJSONLD(t *testing.T) {
|
||||
html := `<script type="application/ld+json">{"@context":"https://schema.org","@type":"VideoObject","name":"Smiling Man and Woman Waving at Camera","description":"Close up shot of a smiling couple waving.","thumbnailUrl":"https://elements-resized.envatousercontent.com/example/video_preview/video_preview_0001.jpg","contentUrl":"https://video-previews.elements.envatousercontent.com/example/watermarked_preview/watermarked_preview.mp4"}</script>`
|
||||
meta := extractVideoObjectJSONLD(html)
|
||||
if meta.Name != "Smiling Man and Woman Waving at Camera" {
|
||||
t.Fatalf("unexpected name: %#v", meta)
|
||||
}
|
||||
if meta.ContentURL == "" || meta.ThumbnailURL == "" || meta.Description == "" {
|
||||
t.Fatalf("expected full video object metadata, got %#v", meta)
|
||||
}
|
||||
}
|
||||
|
||||
func TestCleanArtgridTitle(t *testing.T) {
|
||||
got := cleanArtgridTitle("movie film moving slowly from a reel by Arthur Cauty | Royalty Free Stock Footage – Artgrid.io")
|
||||
want := "movie film moving slowly from a reel"
|
||||
if got != want {
|
||||
t.Fatalf("expected %q, got %q", want, got)
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user