Improve source parsing from Envato and Artgrid HTML
build-push / docker (push) Successful in 4m28s

This commit is contained in:
AI Assistant
2026-03-13 19:03:21 +09:00
parent 06ea4f3ecd
commit ae091c5a7d
3 changed files with 141 additions and 3 deletions
+24
View File
@@ -64,6 +64,30 @@
- keep the anti-timeout optimization
- recover Envato/Artgrid recall when the early pass is too narrow
## Current Session Update (2026-03-13, HTML Snapshot Analysis)
- Used saved HTML snapshots supplied by the user for:
- Envato item page
- Artgrid clip page
- Findings:
- Envato page exposes clean `VideoObject` JSON-LD with:
- exact asset title
- rich description
- thumbnail URL
- preview mp4 URL
- Artgrid page exposes reliable meta fields for:
- title
- description
- thumbnail
- canonical URL
- Artgrid snapshot still does **not** expose a stable preview mp4 or m3u8 in the saved HTML or downloaded asset bundle inspected here
- Fixes applied from the snapshots:
- Envato enrichment now prefers `VideoObject` JSON-LD over generic meta tags
- Envato search cards should now align much better with the actual source asset and preview
- Artgrid title/description are now cleaned so Gemini/source text is less polluted by site suffixes and generic boilerplate
- Remaining limitation:
- Artgrid hover-video preview cannot be derived reliably from the provided snapshot alone
- if Artgrid preview video is still required, the next useful artifact is a browser HAR or DevTools network capture from an opened clip page
## Local Self-Test Workflow
- Primary command:
- `bash scripts/selftest.sh`
+98 -3
View File
@@ -197,17 +197,21 @@ func (s *SearchService) enrichEnvato(result SearchResult) SearchResult {
if err != nil {
return result
}
videoMeta := extractVideoObjectJSONLD(html)
result.Title = firstNonEmpty(
cleanEnvatoTitle(videoMeta.Name),
extractMetaContent(html, "og:title"),
result.Title,
)
result.Snippet = firstNonEmpty(
cleanEnvatoDescription(videoMeta.Description),
extractMetaContent(html, "og:description"),
extractMetaContent(html, "description"),
result.Snippet,
)
pageThumbnail := firstNonEmpty(
videoMeta.ThumbnailURL,
extractMetaContent(html, "og:image"),
extractMetaContent(html, "twitter:image"),
extractJSONLDValue(html, "thumbnailUrl"),
@@ -217,6 +221,7 @@ func (s *SearchService) enrichEnvato(result SearchResult) SearchResult {
}
if result.PreviewVideoURL == "" {
result.PreviewVideoURL = firstNonEmpty(
videoMeta.ContentURL,
extractJSONLDValue(html, "contentUrl"),
extractMetaContent(html, "twitter:player:stream"),
extractVideoPreviewURL(html),
@@ -249,12 +254,13 @@ func (s *SearchService) enrichArtgrid(result SearchResult) SearchResult {
html, err := s.fetchText(result.Link)
if err == nil {
result.Title = firstNonEmpty(
extractMetaContent(html, "og:title"),
cleanArtgridTitle(extractMetaContent(html, "og:title")),
cleanArtgridTitle(extractMetaContent(html, "title")),
result.Title,
)
result.Snippet = firstNonEmpty(
extractMetaContent(html, "og:description"),
extractMetaContent(html, "description"),
cleanArtgridDescription(extractMetaContent(html, "og:description")),
cleanArtgridDescription(extractMetaContent(html, "description")),
result.Snippet,
)
pageThumbnail := firstNonEmpty(
@@ -630,6 +636,95 @@ func extractJSONLDValue(html, key string) string {
return ""
}
// videoObjectMetadata holds the fields read from a schema.org VideoObject
// JSON-LD block. Every field is the empty string when the corresponding key
// was absent from the block.
type videoObjectMetadata struct {
// Name is taken from the JSON-LD "name" key.
Name string
// Description is taken from the JSON-LD "description" key.
Description string
// ThumbnailURL is taken from the JSON-LD "thumbnailUrl" key.
ThumbnailURL string
// ContentURL is taken from the JSON-LD "contentUrl" key.
ContentURL string
}
// videoObjectScriptPattern captures the body of every
// <script type="application/ld+json"> element. It is compiled once at package
// scope so extractVideoObjectJSONLD does not rebuild it on every call.
var videoObjectScriptPattern = regexp.MustCompile(`(?is)<script[^>]+type=["']application/ld\+json["'][^>]*>(.*?)</script>`)

// extractVideoObjectJSONLD scans the JSON-LD script blocks in html and returns
// the metadata of the first VideoObject that carries at least one of name,
// description, thumbnailUrl or contentUrl.
//
// A VideoObject is recognised when it is the top-level object, an element of a
// top-level array, or an element of an "@graph" container, and when its
// "@type" is either the string "VideoObject" (case-insensitive) or a list
// containing it. Blocks that are not valid JSON are skipped. The zero value is
// returned when no usable VideoObject is found.
func extractVideoObjectJSONLD(html string) videoObjectMetadata {
	for _, match := range videoObjectScriptPattern.FindAllStringSubmatch(html, -1) {
		if len(match) != 2 {
			continue
		}
		payload, ok := decodeJSONLDPayload(strings.TrimSpace(match[1]))
		if !ok {
			continue
		}
		if meta, found := findVideoObjectNode(payload); found {
			return meta
		}
	}
	return videoObjectMetadata{}
}

// decodeJSONLDPayload parses one JSON-LD script body.
//
// The raw text is tried first: unescaping before parsing would turn an entity
// such as &quot; inside a JSON string into a bare quote and invalidate the
// whole document. The HTML-unescaped form is only a fallback for pages whose
// JSON was entity-encoded wholesale.
func decodeJSONLDPayload(raw string) (any, bool) {
	var payload any
	if err := json.Unmarshal([]byte(raw), &payload); err == nil {
		return payload, true
	}
	var unescaped any
	if err := json.Unmarshal([]byte(htmlUnescape(raw)), &unescaped); err == nil {
		return unescaped, true
	}
	return nil, false
}

// findVideoObjectNode walks a decoded JSON-LD value (object, array, or
// "@graph" container) and returns the first VideoObject with any populated
// field. The boolean result reports whether one was found.
func findVideoObjectNode(node any) (videoObjectMetadata, bool) {
	switch typed := node.(type) {
	case []any:
		for _, item := range typed {
			if meta, ok := findVideoObjectNode(item); ok {
				return meta, true
			}
		}
	case map[string]any:
		if isVideoObjectType(typed["@type"]) {
			meta := videoObjectMetadata{
				Name:         stringValue(typed["name"]),
				Description:  stringValue(typed["description"]),
				ThumbnailURL: stringValue(typed["thumbnailUrl"]),
				ContentURL:   stringValue(typed["contentUrl"]),
			}
			// An entirely empty VideoObject is useless to callers; keep looking.
			if meta.Name != "" || meta.Description != "" || meta.ThumbnailURL != "" || meta.ContentURL != "" {
				return meta, true
			}
		}
		if graph, ok := typed["@graph"]; ok {
			return findVideoObjectNode(graph)
		}
	}
	return videoObjectMetadata{}, false
}

// isVideoObjectType reports whether a JSON-LD "@type" value names VideoObject,
// accepting both the single-string form and the list form.
func isVideoObjectType(value any) bool {
	switch typed := value.(type) {
	case string:
		return strings.EqualFold(typed, "VideoObject")
	case []any:
		for _, item := range typed {
			if name, ok := item.(string); ok && strings.EqualFold(name, "VideoObject") {
				return true
			}
		}
	}
	return false
}
// stringValue flattens a decoded JSON value into a single string.
//
// A string is whitespace-trimmed and passed through htmlUnescape. A list is
// searched in order, recursively, and the first non-empty result wins. Any
// other kind of value (numbers, objects, nil, ...) yields "".
func stringValue(value any) string {
	if text, isString := value.(string); isString {
		return htmlUnescape(strings.TrimSpace(text))
	}

	items, isList := value.([]any)
	if !isList {
		return ""
	}
	for _, candidate := range items {
		resolved := stringValue(candidate)
		if resolved == "" {
			continue
		}
		return resolved
	}
	return ""
}
// cleanEnvatoTitle trims surrounding whitespace from title, runs it through
// htmlUnescape, and drops a trailing " - Envato" site suffix when present.
func cleanEnvatoTitle(title string) string {
	const siteSuffix = " - Envato"

	cleaned := htmlUnescape(strings.TrimSpace(title))
	if !strings.HasSuffix(cleaned, siteSuffix) {
		return cleaned
	}
	return cleaned[:len(cleaned)-len(siteSuffix)]
}
// cleanEnvatoDescription trims surrounding whitespace from description, runs
// it through htmlUnescape, and then rewrites any "&amp;" still left in the
// text to a plain "&".
func cleanEnvatoDescription(description string) string {
	unescaped := htmlUnescape(strings.TrimSpace(description))
	return strings.Replace(unescaped, "&amp;", "&", -1)
}
// cleanArtgridTitle reduces an Artgrid page title to the clip name.
//
// The title is whitespace-trimmed and passed through htmlUnescape, the known
// Artgrid site suffixes are removed, and the trailing " by <author>" credit is
// dropped. The credit is located with the last " by " in the title so that
// clip names which themselves contain " by " (for example "Man walking by the
// river by John Doe") keep their full text.
func cleanArtgridTitle(title string) string {
	title = htmlUnescape(strings.TrimSpace(title))

	siteSuffixes := []string{
		" | Stock Video Footage * Artgrid.io*",
		" | Stock Video Footage - Artgrid.io",
		" | Royalty Free Stock Footage Artgrid.io",
		" | Royalty Free Stock Footage - Artgrid.io",
	}
	for _, suffix := range siteSuffixes {
		title = strings.TrimSuffix(title, suffix)
	}

	// idx > 0 keeps a title that merely starts with " by " untouched.
	if idx := strings.LastIndex(title, " by "); idx > 0 {
		title = title[:idx]
	}
	return strings.TrimSpace(title)
}
// cleanArtgridDescription strips Artgrid boilerplate from a page description.
//
// The description is whitespace-trimmed and passed through htmlUnescape. Known
// trailing boilerplate (the "Download this royalty free video ..." sentence
// and the " | Royalty Free Stock Footage ..." site suffix) is then removed,
// and if a " | " separator remains, only the text after the first one is kept.
//
// Boilerplate is removed before splitting on purpose: for an input such as
// "Desc | Royalty Free Stock Footage Artgrid.io" splitting first would keep
// the site suffix and throw the real description away.
func cleanArtgridDescription(description string) string {
	description = htmlUnescape(strings.TrimSpace(description))

	boilerplateSuffixes := []string{
		" Download this royalty free video and other Stunning Stock HD Videos from Artgrid.",
		" Download this royalty free video and other Stunning Stock HD Videos from Artgrid",
		" | Royalty Free Stock Footage Artgrid.io",
		" | Royalty Free Stock Footage - Artgrid.io",
	}
	// Repeat until nothing more comes off so the suffixes can appear in any
	// order. Each pass either shortens the text or ends the loop.
	for {
		trimmed := description
		for _, suffix := range boilerplateSuffixes {
			trimmed = strings.TrimSuffix(trimmed, suffix)
		}
		if trimmed == description {
			break
		}
		description = trimmed
	}

	if _, after, found := strings.Cut(description, " | "); found {
		description = after
	}
	return strings.TrimSpace(description)
}
func deriveEnvatoPreviewFromThumbnail(thumbnail string) string {
candidate := htmlUnescape(strings.TrimSpace(thumbnail))
if candidate == "" {
+19
View File
@@ -30,3 +30,22 @@ func TestIsUsefulGoogleVideoResultRejectsMusicResults(t *testing.T) {
t.Fatal("expected bgm/music result to be rejected")
}
}
// TestExtractVideoObjectJSONLD feeds a single VideoObject JSON-LD script to
// extractVideoObjectJSONLD and checks that the name matches exactly and that
// the content URL, thumbnail URL and description are all populated.
func TestExtractVideoObjectJSONLD(t *testing.T) {
	const page = `<script type="application/ld+json">{"@context":"https://schema.org","@type":"VideoObject","name":"Smiling Man and Woman Waving at Camera","description":"Close up shot of a smiling couple waving.","thumbnailUrl":"https://elements-resized.envatousercontent.com/example/video_preview/video_preview_0001.jpg","contentUrl":"https://video-previews.elements.envatousercontent.com/example/watermarked_preview/watermarked_preview.mp4"}</script>`

	got := extractVideoObjectJSONLD(page)

	if got.Name != "Smiling Man and Woman Waving at Camera" {
		t.Fatalf("unexpected name: %#v", got)
	}
	for _, field := range []string{got.ContentURL, got.ThumbnailURL, got.Description} {
		if field == "" {
			t.Fatalf("expected full video object metadata, got %#v", got)
		}
	}
}
// TestCleanArtgridTitle checks that cleanArtgridTitle removes both the author
// credit and the Artgrid site suffix from a typical page title.
func TestCleanArtgridTitle(t *testing.T) {
	const (
		input = "movie film moving slowly from a reel by Arthur Cauty | Royalty Free Stock Footage Artgrid.io"
		want  = "movie film moving slowly from a reel"
	)

	if got := cleanArtgridTitle(input); got != want {
		t.Fatalf("expected %q, got %q", want, got)
	}
}