From ae091c5a7d320fe3a7963fb254aa76e0a19fb969 Mon Sep 17 00:00:00 2001
From: AI Assistant
Date: Fri, 13 Mar 2026 19:03:21 +0900
Subject: [PATCH] Improve source parsing from Envato and Artgrid HTML

---
 TODO.md                      |  24 +++++++++
 backend/services/cse.go      | 122 +++++++++++++++++++++++++++++++++++--
 backend/services/cse_test.go |  19 +++++++
 3 files changed, 162 insertions(+), 3 deletions(-)

diff --git a/TODO.md b/TODO.md
index 92c885b..3a367c4 100644
--- a/TODO.md
+++ b/TODO.md
@@ -64,6 +64,30 @@
   - keep the anti-timeout optimization
   - recover Envato/Artgrid recall when the early pass is too narrow
 
+## Current Session Update (2026-03-13, HTML Snapshot Analysis)
+- Used saved HTML snapshots supplied by the user for:
+  - Envato item page
+  - Artgrid clip page
+- Findings:
+  - Envato page exposes clean `VideoObject` JSON-LD with:
+    - exact asset title
+    - rich description
+    - thumbnail URL
+    - preview mp4 URL
+  - Artgrid page exposes reliable meta fields for:
+    - title
+    - description
+    - thumbnail
+    - canonical URL
+  - Artgrid snapshot still does **not** expose a stable preview mp4 or m3u8 in the saved HTML or downloaded asset bundle inspected here
+- Fixes applied from the snapshots:
+  - Envato enrichment now prefers `VideoObject` JSON-LD over generic meta tags
+  - Envato search cards should now align much better with the actual source asset and preview
+  - Artgrid title/description are now cleaned so Gemini/source text is less polluted by site suffixes and generic boilerplate
+- Remaining limitation:
+  - Artgrid hover-video preview cannot be derived reliably from the provided snapshot alone
+  - if Artgrid preview video is still required, the next useful artifact is a browser HAR or DevTools network capture from an opened clip page
+
 ## Local Self-Test Workflow
 - Primary command:
   - `bash scripts/selftest.sh`
diff --git a/backend/services/cse.go b/backend/services/cse.go
index 7b74180..0d657c2 100644
--- a/backend/services/cse.go
+++ b/backend/services/cse.go
@@ -197,17 +197,21 @@ func (s *SearchService) enrichEnvato(result SearchResult) SearchResult {
 	if err != nil {
 		return result
 	}
+	videoMeta := extractVideoObjectJSONLD(html)
 
 	result.Title = firstNonEmpty(
+		cleanEnvatoTitle(videoMeta.Name),
 		extractMetaContent(html, "og:title"),
 		result.Title,
 	)
 	result.Snippet = firstNonEmpty(
+		cleanEnvatoDescription(videoMeta.Description),
 		extractMetaContent(html, "og:description"),
 		extractMetaContent(html, "description"),
 		result.Snippet,
 	)
 	pageThumbnail := firstNonEmpty(
+		videoMeta.ThumbnailURL,
 		extractMetaContent(html, "og:image"),
 		extractMetaContent(html, "twitter:image"),
 		extractJSONLDValue(html, "thumbnailUrl"),
@@ -217,6 +221,7 @@ func (s *SearchService) enrichEnvato(result SearchResult) SearchResult {
 	}
 	if result.PreviewVideoURL == "" {
 		result.PreviewVideoURL = firstNonEmpty(
+			videoMeta.ContentURL,
 			extractJSONLDValue(html, "contentUrl"),
 			extractMetaContent(html, "twitter:player:stream"),
 			extractVideoPreviewURL(html),
@@ -249,12 +254,13 @@ func (s *SearchService) enrichArtgrid(result SearchResult) SearchResult {
 	html, err := s.fetchText(result.Link)
 	if err == nil {
 		result.Title = firstNonEmpty(
-			extractMetaContent(html, "og:title"),
+			cleanArtgridTitle(extractMetaContent(html, "og:title")),
+			cleanArtgridTitle(extractMetaContent(html, "title")),
 			result.Title,
 		)
 		result.Snippet = firstNonEmpty(
-			extractMetaContent(html, "og:description"),
-			extractMetaContent(html, "description"),
+			cleanArtgridDescription(extractMetaContent(html, "og:description")),
+			cleanArtgridDescription(extractMetaContent(html, "description")),
 			result.Snippet,
 		)
 		pageThumbnail := firstNonEmpty(
@@ -630,6 +636,116 @@ func extractJSONLDValue(html, key string) string {
 	return ""
 }
 
+// videoObjectMetadata holds the fields read from a VideoObject JSON-LD block.
+type videoObjectMetadata struct {
+	Name         string
+	Description  string
+	ThumbnailURL string
+	ContentURL   string
+}
+
+// videoObjectScriptPattern captures the body of every
+// <script type="application/ld+json"> element. It is compiled once at package
+// scope instead of on each call.
+var videoObjectScriptPattern = regexp.MustCompile(`(?is)<script[^>]+type=["']application/ld\+json["'][^>]*>(.*?)</script>`)
+
+// extractVideoObjectJSONLD returns the metadata of the first JSON-LD object in
+// html whose @type is VideoObject and that sets at least one of name,
+// description, thumbnailUrl or contentUrl. It returns the zero value when no
+// such object is found; script bodies that are not a JSON object are skipped.
+func extractVideoObjectJSONLD(html string) videoObjectMetadata {
+	matches := videoObjectScriptPattern.FindAllStringSubmatch(html, -1)
+	for _, match := range matches {
+		if len(match) != 2 {
+			continue
+		}
+		var payload map[string]any
+		if err := json.Unmarshal([]byte(htmlUnescape(strings.TrimSpace(match[1]))), &payload); err != nil {
+			continue
+		}
+		typeName, _ := payload["@type"].(string)
+		if !strings.EqualFold(typeName, "VideoObject") {
+			continue
+		}
+		meta := videoObjectMetadata{
+			Name:         stringValue(payload["name"]),
+			Description:  stringValue(payload["description"]),
+			ThumbnailURL: stringValue(payload["thumbnailUrl"]),
+			ContentURL:   stringValue(payload["contentUrl"]),
+		}
+		if meta.Name != "" || meta.Description != "" || meta.ThumbnailURL != "" || meta.ContentURL != "" {
+			return meta
+		}
+	}
+	return videoObjectMetadata{}
+}
+
+// stringValue returns value as a trimmed, HTML-unescaped string. For a JSON
+// array it returns the first non-empty element; any other type yields "".
+func stringValue(value any) string {
+	switch typed := value.(type) {
+	case string:
+		return htmlUnescape(strings.TrimSpace(typed))
+	case []any:
+		for _, item := range typed {
+			if text := stringValue(item); text != "" {
+				return text
+			}
+		}
+	}
+	return ""
+}
+
+// cleanEnvatoTitle unescapes title and drops a trailing " - Envato".
+func cleanEnvatoTitle(title string) string {
+	title = htmlUnescape(strings.TrimSpace(title))
+	return strings.TrimSuffix(title, " - Envato")
+}
+
+// cleanEnvatoDescription unescapes description and turns any "&amp;" that is
+// still present afterwards (double-escaped source text) into "&".
+func cleanEnvatoDescription(description string) string {
+	description = htmlUnescape(strings.TrimSpace(description))
+	description = strings.ReplaceAll(description, "&amp;", "&")
+	return description
+}
+
+// cleanArtgridTitle unescapes title, drops the known Artgrid site suffixes and
+// cuts the trailing " by <author>" credit.
+func cleanArtgridTitle(title string) string {
+	title = htmlUnescape(strings.TrimSpace(title))
+	replacements := []string{
+		" | Stock Video Footage * Artgrid.io*",
+		" | Stock Video Footage - Artgrid.io",
+		" | Royalty Free Stock Footage – Artgrid.io",
+		" | Royalty Free Stock Footage - Artgrid.io",
+	}
+	for _, suffix := range replacements {
+		title = strings.TrimSuffix(title, suffix)
+	}
+	// The credit is the last " by "; an earlier one belongs to the title itself.
+	if idx := strings.LastIndex(title, " by "); idx > 0 {
+		title = title[:idx]
+	}
+	return strings.TrimSpace(title)
+}
+
+// cleanArtgridDescription unescapes description, drops the known Artgrid
+// boilerplate suffixes, and keeps only the text after the first " | ".
+func cleanArtgridDescription(description string) string {
+	description = htmlUnescape(strings.TrimSpace(description))
+	description = strings.TrimSuffix(description, " Download this royalty free video and other Stunning Stock HD Videos from Artgrid.")
+	description = strings.TrimSuffix(description, " Download this royalty free video and other Stunning Stock HD Videos from Artgrid")
+	// Strip the site suffix before splitting; otherwise "text | <suffix>" would
+	// split into the bare suffix and return boilerplate as the description.
+	description = strings.TrimSuffix(description, " | Royalty Free Stock Footage – Artgrid.io")
+	description = strings.TrimSuffix(description, " | Royalty Free Stock Footage - Artgrid.io")
+	if parts := strings.SplitN(description, " | ", 2); len(parts) == 2 {
+		description = parts[1]
+	}
+	return strings.TrimSpace(description)
+}
+
 func deriveEnvatoPreviewFromThumbnail(thumbnail string) string {
 	candidate := htmlUnescape(strings.TrimSpace(thumbnail))
 	if candidate == "" {
diff --git a/backend/services/cse_test.go b/backend/services/cse_test.go
index 0e790c3..493c85e 100644
--- a/backend/services/cse_test.go
+++ b/backend/services/cse_test.go
@@ -30,3 +30,22 @@ func TestIsUsefulGoogleVideoResultRejectsMusicResults(t *testing.T) {
 		t.Fatal("expected bgm/music result to be rejected")
 	}
 }
+
+func TestExtractVideoObjectJSONLD(t *testing.T) {
+	html := `<script type="application/ld+json">{"@context":"https://schema.org","@type":"VideoObject","name":"Smiling Man and Woman Waving at Camera","description":"A smiling man and woman wave at the camera.","thumbnailUrl":"https://example.com/thumbnail.jpg","contentUrl":"https://example.com/preview.mp4"}</script>`
+	meta := extractVideoObjectJSONLD(html)
+	if meta.Name != "Smiling Man and Woman Waving at Camera" {
+		t.Fatalf("unexpected name: %#v", meta)
+	}
+	if meta.ContentURL == "" || meta.ThumbnailURL == "" || meta.Description == "" {
+		t.Fatalf("expected full video object metadata, got %#v", meta)
+	}
+}
+
+func TestCleanArtgridTitle(t *testing.T) {
+	got := cleanArtgridTitle("movie film moving slowly from a reel by Arthur Cauty | Royalty Free Stock Footage – Artgrid.io")
+	want := "movie film moving slowly from a reel"
+	if got != want {
+		t.Fatalf("expected %q, got %q", want, got)
+	}
+}