Improve source parsing from Envato and Artgrid HTML
build-push / docker (push) Successful in 4m28s

This commit is contained in:
AI Assistant
2026-03-13 19:03:21 +09:00
parent 06ea4f3ecd
commit ae091c5a7d
3 changed files with 141 additions and 3 deletions
+98 -3
View File
@@ -197,17 +197,21 @@ func (s *SearchService) enrichEnvato(result SearchResult) SearchResult {
if err != nil {
return result
}
videoMeta := extractVideoObjectJSONLD(html)
result.Title = firstNonEmpty(
cleanEnvatoTitle(videoMeta.Name),
extractMetaContent(html, "og:title"),
result.Title,
)
result.Snippet = firstNonEmpty(
cleanEnvatoDescription(videoMeta.Description),
extractMetaContent(html, "og:description"),
extractMetaContent(html, "description"),
result.Snippet,
)
pageThumbnail := firstNonEmpty(
videoMeta.ThumbnailURL,
extractMetaContent(html, "og:image"),
extractMetaContent(html, "twitter:image"),
extractJSONLDValue(html, "thumbnailUrl"),
@@ -217,6 +221,7 @@ func (s *SearchService) enrichEnvato(result SearchResult) SearchResult {
}
if result.PreviewVideoURL == "" {
result.PreviewVideoURL = firstNonEmpty(
videoMeta.ContentURL,
extractJSONLDValue(html, "contentUrl"),
extractMetaContent(html, "twitter:player:stream"),
extractVideoPreviewURL(html),
@@ -249,12 +254,13 @@ func (s *SearchService) enrichArtgrid(result SearchResult) SearchResult {
html, err := s.fetchText(result.Link)
if err == nil {
result.Title = firstNonEmpty(
extractMetaContent(html, "og:title"),
cleanArtgridTitle(extractMetaContent(html, "og:title")),
cleanArtgridTitle(extractMetaContent(html, "title")),
result.Title,
)
result.Snippet = firstNonEmpty(
extractMetaContent(html, "og:description"),
extractMetaContent(html, "description"),
cleanArtgridDescription(extractMetaContent(html, "og:description")),
cleanArtgridDescription(extractMetaContent(html, "description")),
result.Snippet,
)
pageThumbnail := firstNonEmpty(
@@ -630,6 +636,95 @@ func extractJSONLDValue(html, key string) string {
return ""
}
// videoObjectMetadata carries the string fields that extractVideoObjectJSONLD
// reads from a JSON-LD object whose @type is "VideoObject". Each field is filled
// from the JSON key of the matching name ("name", "description", "thumbnailUrl",
// "contentUrl"); a field is the empty string when that key yielded no text.
type videoObjectMetadata struct {
	// Name and Description hold the "name" and "description" values.
	Name, Description string
	// ThumbnailURL and ContentURL hold the "thumbnailUrl" and "contentUrl" values.
	ThumbnailURL, ContentURL string
}
// jsonLDScriptBlockPattern matches <script type="application/ld+json"> elements
// and captures their body. It is compiled once at package scope instead of on
// every call to extractVideoObjectJSONLD.
var jsonLDScriptBlockPattern = regexp.MustCompile(`(?is)<script[^>]+type=["']application/ld\+json["'][^>]*>(.*?)</script>`)

// extractVideoObjectJSONLD scans html for JSON-LD script blocks and returns the
// metadata of the first VideoObject that has at least one non-empty field.
//
// A VideoObject is recognised when it is the top-level object, an element of a
// top-level array, or nested inside an "@graph" container, and when its "@type"
// is either the string "VideoObject" or an array containing it (compared
// case-insensitively). Blocks that are not valid JSON are skipped. When nothing
// usable is found the zero videoObjectMetadata is returned.
func extractVideoObjectJSONLD(html string) videoObjectMetadata {
	for _, match := range jsonLDScriptBlockPattern.FindAllStringSubmatch(html, -1) {
		if len(match) != 2 {
			continue
		}
		raw := strings.TrimSpace(match[1])

		var payload any
		// Parse the script body as-is first: unescaping HTML entities up front
		// would turn an entity such as &quot; inside a JSON string into a bare
		// quote and make otherwise valid JSON unparseable.
		if err := json.Unmarshal([]byte(raw), &payload); err != nil {
			// Fall back to the unescaped text for pages that entity-encode the
			// whole JSON document (htmlUnescape is defined elsewhere in this file).
			if err := json.Unmarshal([]byte(htmlUnescape(raw)), &payload); err != nil {
				continue
			}
		}

		if meta, ok := videoObjectFromJSONLD(payload); ok {
			return meta
		}
	}
	return videoObjectMetadata{}
}

// videoObjectFromJSONLD walks a decoded JSON-LD value (object, array, or
// "@graph" container) and returns the first VideoObject with any non-empty field.
func videoObjectFromJSONLD(node any) (videoObjectMetadata, bool) {
	switch typed := node.(type) {
	case []any:
		for _, item := range typed {
			if meta, ok := videoObjectFromJSONLD(item); ok {
				return meta, true
			}
		}
	case map[string]any:
		if isJSONLDVideoObjectType(typed["@type"]) {
			meta := videoObjectMetadata{
				Name:         stringValue(typed["name"]),
				Description:  stringValue(typed["description"]),
				ThumbnailURL: stringValue(typed["thumbnailUrl"]),
				ContentURL:   stringValue(typed["contentUrl"]),
			}
			if meta != (videoObjectMetadata{}) {
				return meta, true
			}
		}
		if graph, ok := typed["@graph"]; ok {
			return videoObjectFromJSONLD(graph)
		}
	}
	return videoObjectMetadata{}, false
}

// isJSONLDVideoObjectType reports whether a JSON-LD "@type" value names
// VideoObject, either directly as a string or as one entry of an array.
func isJSONLDVideoObjectType(value any) bool {
	switch typed := value.(type) {
	case string:
		return strings.EqualFold(typed, "VideoObject")
	case []any:
		for _, item := range typed {
			if name, ok := item.(string); ok && strings.EqualFold(name, "VideoObject") {
				return true
			}
		}
	}
	return false
}
// stringValue reduces a decoded JSON value to a single string.
//
// A string is whitespace-trimmed and passed through htmlUnescape (defined
// elsewhere in this file). A []any is searched in order, recursively, and the
// first element that produces a non-empty result wins. Every other kind of
// value, including nil, yields "".
func stringValue(value any) string {
	if text, isString := value.(string); isString {
		return htmlUnescape(strings.TrimSpace(text))
	}

	items, isList := value.([]any)
	if !isList {
		return ""
	}
	for i := range items {
		candidate := stringValue(items[i])
		if candidate != "" {
			return candidate
		}
	}
	return ""
}
// cleanEnvatoTitle trims surrounding whitespace from title, runs it through
// htmlUnescape (defined elsewhere in this file), and drops one trailing
// " - Envato" marker when present.
func cleanEnvatoTitle(title string) string {
	const brandSuffix = " - Envato"

	cleaned := htmlUnescape(strings.TrimSpace(title))
	if !strings.HasSuffix(cleaned, brandSuffix) {
		return cleaned
	}
	return cleaned[:len(cleaned)-len(brandSuffix)]
}
// cleanEnvatoDescription trims surrounding whitespace from description, runs it
// through htmlUnescape (defined elsewhere in this file), and then rewrites every
// "&amp;" that is still present to "&".
func cleanEnvatoDescription(description string) string {
	unescaped := htmlUnescape(strings.TrimSpace(description))
	// n = -1 replaces every occurrence.
	return strings.Replace(unescaped, "&amp;", "&", -1)
}
// cleanArtgridTitle normalises an Artgrid page title.
//
// The title is whitespace-trimmed and passed through htmlUnescape (defined
// elsewhere in this file), the known Artgrid branding suffixes are stripped, and
// everything from the last " by " onward is removed (presumably the trailing
// author credit; confirm against real Artgrid titles). The result is
// whitespace-trimmed again before being returned.
func cleanArtgridTitle(title string) string {
	title = htmlUnescape(strings.TrimSpace(title))

	replacements := []string{
		" | Stock Video Footage * Artgrid.io*",
		" | Stock Video Footage - Artgrid.io",
		" | Royalty Free Stock Footage Artgrid.io",
		" | Royalty Free Stock Footage - Artgrid.io",
	}
	for _, suffix := range replacements {
		title = strings.TrimSuffix(title, suffix)
	}

	// Cut at the LAST " by ": the segment being removed sits at the end of the
	// title, so cutting at the first occurrence would truncate clip names that
	// themselves contain " by " (e.g. "Man walking by the river by John Doe").
	if idx := strings.LastIndex(title, " by "); idx > 0 {
		title = title[:idx]
	}
	return strings.TrimSpace(title)
}
// cleanArtgridDescription normalises an Artgrid page description.
//
// Steps, in order:
//  1. Trim whitespace and apply htmlUnescape (defined elsewhere in this file).
//  2. Strip the trailing "Download this royalty free video ..." boilerplate,
//     with or without its final period.
//  3. Strip a trailing " | Royalty Free Stock Footage ..." branding suffix.
//  4. If a " | " separator remains, keep only the text after the first one.
//  5. Trim whitespace again.
//
// Branding is removed before the " | " split on purpose: for an input such as
// "Desc | Royalty Free Stock Footage - Artgrid.io" splitting first would keep
// only the branding text and discard the real description.
func cleanArtgridDescription(description string) string {
	description = htmlUnescape(strings.TrimSpace(description))

	trailingBoilerplate := []string{
		" Download this royalty free video and other Stunning Stock HD Videos from Artgrid.",
		" Download this royalty free video and other Stunning Stock HD Videos from Artgrid",
		" | Royalty Free Stock Footage Artgrid.io",
		" | Royalty Free Stock Footage - Artgrid.io",
	}
	for _, suffix := range trailingBoilerplate {
		description = strings.TrimSuffix(description, suffix)
	}

	if _, rest, found := strings.Cut(description, " | "); found {
		description = rest
	}
	return strings.TrimSpace(description)
}
func deriveEnvatoPreviewFromThumbnail(thumbnail string) string {
candidate := htmlUnescape(strings.TrimSpace(thumbnail))
if candidate == "" {