diff options
| -rw-r--r-- | utils/mal/anime.go | 184 | ||||
| -rw-r--r-- | utils/mal/client.go | 12 | ||||
| -rw-r--r-- | utils/mal/episodes.go | 218 | ||||
| -rw-r--r-- | utils/mal/types.go | 7 |
4 files changed, 333 insertions, 88 deletions
diff --git a/utils/mal/anime.go b/utils/mal/anime.go index 08bf841..2939245 100644 --- a/utils/mal/anime.go +++ b/utils/mal/anime.go @@ -17,8 +17,8 @@ var ( youtubeIDPattern = regexp.MustCompile(`/embed/([a-zA-Z0-9_-]+)`) themeSongTitlePattern = regexp.MustCompile(`"(.+?)"`) themeSongArtistPattern = regexp.MustCompile(`by\s+(.+?)(?:\s+\(eps|\s*$)`) - themeSongEpisodesPattern = regexp.MustCompile(`\(eps\s+(\d+)(?:-(\d+))?\)`) - japaneseTextInParensPattern = regexp.MustCompile(`\(([^\x00-\x7F]+)\)`) + themeSongEpisodesPattern = regexp.MustCompile(`\(eps\s+([\d,\-\s]+)\)`) + japaneseTextInParensPattern = regexp.MustCompile(`\(([^)]*[^\x00-\x7F][^)]*)\)`) broadcastTimePattern = regexp.MustCompile(`(\w+)s?\s+at\s+(\d{2}:\d{2})\s+\((\w+)\)`) imageResizePrefixPattern = regexp.MustCompile(`/r/\d+x\d+`) leadingIndexPattern = regexp.MustCompile(`^#?\d+:?\s*`) @@ -33,6 +33,8 @@ func extractSidebarValue(document *goquery.Document, label string) string { if strings.TrimSpace(selection.Text()) == label { parentClone := selection.Parent().Clone() parentClone.Find("span.dark_text").Remove() + parentClone.Find("sup").Remove() + parentClone.Find("div").Remove() extractedValue = strings.TrimSpace(parentClone.Text()) } }) @@ -86,36 +88,20 @@ func buildImageFromBaseURL(rawURL string) Image { pathBase := cleanedURL[:extensionIndex] return Image{ - JPG: ImageFormat{ - Small: pathBase + "t.jpg", - Medium: pathBase + ".jpg", - Large: pathBase + "l.jpg", - Original: pathBase + ".jpg", - }, - WEBP: ImageFormat{ - Small: pathBase + "t.webp", - Medium: pathBase + ".webp", - Large: pathBase + "l.webp", - Original: pathBase + ".webp", - }, + Small: pathBase + "t.jpg", + Medium: pathBase + ".jpg", + Large: pathBase + "l.jpg", + Original: pathBase + ".jpg", } } func buildYouTubeThumbnail(videoID string) Image { thumbnailBase := fmt.Sprintf("https://img.youtube.com/vi/%s", videoID) return Image{ - JPG: ImageFormat{ - Small: thumbnailBase + "/default.jpg", - Medium: thumbnailBase + "/mqdefault.jpg", - Large: thumbnailBase + "/hqdefault.jpg", - Original: thumbnailBase + "/maxresdefault.jpg", - }, - WEBP: ImageFormat{ - Small: thumbnailBase + "/default.webp", - Medium: thumbnailBase + "/mqdefault.webp", - Large: thumbnailBase + "/hqdefault.webp", - Original: thumbnailBase + "/maxresdefault.webp", - }, + Small: thumbnailBase + "/default.jpg", + Medium: thumbnailBase + "/mqdefault.jpg", + Large: thumbnailBase + "/hqdefault.jpg", + Original: thumbnailBase + "/maxresdefault.jpg", } } @@ -210,18 +196,38 @@ func parseAnimeSynopsis(document *goquery.Document) string { } func parseAnimeBackground(document *goquery.Document) string { + backgroundHeading := document.Find("h2#background") + if backgroundHeading.Length() == 0 { + return "" + } + + wrapperDiv := backgroundHeading.Parent() + container := wrapperDiv.Parent() + + var foundWrapper bool + var reachedEnd bool var backgroundParts []string - document.Find("h2").Each(func(index int, heading *goquery.Selection) { - if strings.TrimSpace(heading.Text()) != "Background" { + + container.Contents().Each(func(index int, node *goquery.Selection) { + if reachedEnd { return } - heading.NextUntil("h2").Each(func(siblingIndex int, sibling *goquery.Selection) { - text := strings.TrimSpace(sibling.Text()) - if text != "" && !strings.Contains(text, "No background information") { - backgroundParts = append(backgroundParts, text) + if !foundWrapper { + if node.Find("h2#background").Length() > 0 { + foundWrapper = true } - }) + return + } + if node.HasClass("border_top") { + reachedEnd = true + return + } + text := strings.TrimSpace(node.Text()) + if text != "" && !strings.Contains(text, "No background information") { + backgroundParts = append(backgroundParts, text) + } }) + return strings.Join(backgroundParts, " ") } @@ -289,25 +295,41 @@ func parseAnimeBroadcast(document *goquery.Document) Broadcast { func parseAnimeThemeSongs(document *goquery.Document, containerClass string) []ThemeSong { var themeSongs []ThemeSong - document.Find(fmt.Sprintf("div.%s table tr", containerClass)).Each(func(index int, row *goquery.Selection) { - songText := strings.TrimSpace(row.Find("td.theme-song").Text()) - if songText == "" || strings.Contains(songText, "No opening themes") || strings.Contains(songText, "No ending themes") { + document.Find(fmt.Sprintf("div.%s > table tr", containerClass)).Each(func(index int, row *goquery.Selection) { + songCell := row.Find("td").Eq(1) + if songCell.Length() == 0 { + songCell = row.Find("td").First() + } + + if songCell.Find("span.theme-song-index").Length() == 0 { + cellText := strings.TrimSpace(songCell.Text()) + if cellText == "" || !strings.Contains(cellText, `"`) { + return + } + } + + cellText := strings.TrimSpace(songCell.Text()) + if cellText == "" || strings.Contains(cellText, "No opening themes") || strings.Contains(cellText, "No ending themes") { return } - themeSong := parseThemeSongText(songText) + themeSong := parseThemeSongText(cellText) - row.Find("td.theme-song-artist a").Each(func(linkIndex int, linkElement *goquery.Selection) { - href, exists := linkElement.Attr("href") - if !exists || href == "" { + songCell.Find("input[type=hidden]").Each(func(inputIndex int, inputElement *goquery.Selection) { + inputID, _ := inputElement.Attr("id") + inputValue, _ := inputElement.Attr("value") + if inputValue == "" { return } - siteName := strings.TrimSpace(linkElement.Text()) - if siteName == "" { - siteName, _ = linkElement.Attr("title") - } - if siteName != "" { - themeSong.Links = append(themeSong.Links, ExternalLink{Name: siteName, URL: href}) + switch { + case strings.HasPrefix(inputID, "spotify_url"): + themeSong.Links = append(themeSong.Links, ExternalLink{Name: "Spotify", URL: inputValue}) + case strings.HasPrefix(inputID, "apple_url"): + themeSong.Links = append(themeSong.Links, ExternalLink{Name: "Apple Music", URL: inputValue}) + case strings.HasPrefix(inputID, "amazon_url"): + themeSong.Links = append(themeSong.Links, ExternalLink{Name: "Amazon Music", URL: inputValue}) + case strings.HasPrefix(inputID, "youtube_url"): + themeSong.Links = append(themeSong.Links, ExternalLink{Name: "YouTube", URL: inputValue}) } }) @@ -322,11 +344,10 @@ func parseThemeSongText(rawText string) ThemeSong { episodeMatches := themeSongEpisodesPattern.FindStringSubmatch(text) if len(episodeMatches) > 1 { - themeSong.Episodes.Start, _ = strconv.Atoi(episodeMatches[1]) - if len(episodeMatches) > 2 && episodeMatches[2] != "" { - themeSong.Episodes.End, _ = strconv.Atoi(episodeMatches[2]) - } else { - themeSong.Episodes.End = themeSong.Episodes.Start + allNumbers := regexp.MustCompile(`\d+`).FindAllString(episodeMatches[1], -1) + if len(allNumbers) > 0 { + themeSong.Episodes.Start, _ = strconv.Atoi(allNumbers[0]) + themeSong.Episodes.End, _ = strconv.Atoi(allNumbers[len(allNumbers)-1]) } } @@ -352,12 +373,15 @@ func parseThemeSongText(rawText string) ThemeSong { func parseAnimeExternalLinks(document *goquery.Document) []ExternalLink { var externalLinks []ExternalLink - document.Find("div.external_links a.link").Each(func(index int, linkElement *goquery.Selection) { + document.Find("div.external_links a").Each(func(index int, linkElement *goquery.Selection) { href, exists := linkElement.Attr("href") - if !exists || href == "" { + if !exists || href == "" || strings.HasPrefix(href, "#") || strings.HasPrefix(href, "javascript:") { return } - linkName := strings.TrimSpace(linkElement.Text()) + linkName := strings.TrimSpace(linkElement.Find("div.caption").Text()) + if linkName == "" { + linkName = strings.TrimSpace(linkElement.Text()) + } if linkName != "" { externalLinks = append(externalLinks, ExternalLink{Name: linkName, URL: href}) } @@ -367,24 +391,18 @@ func parseAnimeExternalLinks(document *goquery.Document) []ExternalLink { func parseAnimeStreamingLinks(document *goquery.Document) []ExternalLink { var streamingLinks []ExternalLink - document.Find("h2").Each(func(index int, heading *goquery.Selection) { - headingText := strings.TrimSpace(heading.Text()) - if headingText != "Available At" && headingText != "Streaming Platforms" { + document.Find("div.broadcasts a.broadcast-item").Each(func(index int, linkElement *goquery.Selection) { + href, exists := linkElement.Attr("href") + if !exists || href == "" || strings.HasPrefix(href, "javascript:") { return } - heading.NextUntil("h2").Find("a").Each(func(linkIndex int, linkElement *goquery.Selection) { - href, exists := linkElement.Attr("href") - if !exists || href == "" { - return - } - linkName := strings.TrimSpace(linkElement.Text()) - if linkName == "" { - linkName, _ = linkElement.Attr("title") - } - if linkName != "" { - streamingLinks = append(streamingLinks, ExternalLink{Name: linkName, URL: href}) - } - }) + linkName, _ := linkElement.Attr("title") + if linkName == "" { + linkName = strings.TrimSpace(linkElement.Find("div.caption").Text()) + } + if linkName != "" { + streamingLinks = append(streamingLinks, ExternalLink{Name: linkName, URL: href}) + } }) return streamingLinks } @@ -494,6 +512,16 @@ func parseAnimeDocument(document *goquery.Document, malID int) Anime { } } +func fixThemeSongEpisodeRanges(themeSongs []ThemeSong, totalEpisodes int) { + if len(themeSongs) != 1 || totalEpisodes <= 0 { + return + } + if themeSongs[0].Episodes.Start == 0 && themeSongs[0].Episodes.End == 0 { + themeSongs[0].Episodes.Start = 1 + themeSongs[0].Episodes.End = totalEpisodes + } +} + func GetAnimeByMALID(malID int) (*Anime, error) { animePageURL := fmt.Sprintf("%s/anime/%d", malBaseURL, malID) animeDocument, fetchErr := makeRequest(animePageURL) @@ -502,8 +530,14 @@ func GetAnimeByMALID(malID int) (*Anime, error) { return nil, fmt.Errorf("failed to fetch anime page for MAL ID %d: %w", malID, fetchErr) } + logger.Debugf("MALScraper", "Parsing anime page for MAL ID %d", malID) anime := parseAnimeDocument(animeDocument, malID) + logger.Debugf("MALScraper", "Parsed anime page: Title=%q, EpisodeCount=%d", anime.Title.Romaji, anime.EpisodeCount) + + fixThemeSongEpisodeRanges(anime.Openings, anime.EpisodeCount) + fixThemeSongEpisodeRanges(anime.Endings, anime.EpisodeCount) + logger.Debugf("MALScraper", "Fetching videos page for MAL ID %d", malID) videosPageURL := fmt.Sprintf("%s/anime/%d/_/video", malBaseURL, malID) videosDocument, videosFetchErr := makeRequest(videosPageURL) if videosFetchErr != nil { @@ -511,6 +545,16 @@ func GetAnimeByMALID(malID int) (*Anime, error) { } else { anime.Videos = parsePromotionalVideos(videosDocument) anime.MusicVideos = parseMusicVideos(videosDocument) + logger.Debugf("MALScraper", "Parsed videos: %d promotional, %d music", len(anime.Videos), len(anime.MusicVideos)) + } + + logger.Debugf("MALScraper", "Fetching episodes for MAL ID %d", malID) + episodes, episodesFetchErr := GetAnimeEpisodesByMALID(malID) + if episodesFetchErr != nil { + logger.Warnf("MALClient", "Failed to fetch episodes for MAL ID %d: %v", malID, episodesFetchErr) + } else { + anime.Episodes = episodes + logger.Debugf("MALScraper", "Fetched %d episodes for MAL ID %d", len(episodes), malID) } return &anime, nil diff --git a/utils/mal/client.go b/utils/mal/client.go index 9bc5d98..1cbb26a 100644 --- a/utils/mal/client.go +++ b/utils/mal/client.go @@ -67,16 +67,6 @@ func makeRequest(targetURL string) (*goquery.Document, error) { if parseErr != nil { return nil, fmt.Errorf("failed to parse HTML from %s: %w", targetURL, parseErr) } - - pageTitle := document.Find("title").Text() - logger.Debugf("MALClient", "Page title for %s: %q", targetURL, pageTitle) - - htmlContent, _ := document.Html() - if len(htmlContent) > 500 { - htmlContent = htmlContent[:500] - } - logger.Debugf("MALClient", "HTML preview for %s: %s", targetURL, htmlContent) - return document, nil } @@ -107,4 +97,4 @@ func makeRequest(targetURL string) (*goquery.Document, error) { func getBackoffDuration(attempt int) time.Duration { exponentialDelay := time.Duration(float64(backoffBase) * math.Pow(2, float64(attempt-1))) return cfbypass.AddJitter(exponentialDelay) -}
\ No newline at end of file +} diff --git a/utils/mal/episodes.go b/utils/mal/episodes.go index ff61686..8a22259 100644 --- a/utils/mal/episodes.go +++ b/utils/mal/episodes.go @@ -1,14 +1,46 @@ package mal import ( + "encoding/json" "fmt" "metachan/utils/logger" + "regexp" "strconv" "strings" "github.com/PuerkitoBio/goquery" ) +var ( + crunchyrollOldCDNPattern = regexp.MustCompile(`img\d*\.ak\.crunchyroll\.com/i/spire\d*-tmb/([a-f0-9]{32})`) + episodeScorePattern = regexp.MustCompile(`(\d+\.\d+)`) +) + +func containsNonLatinCharacters(text string) bool { + for _, runeValue := range text { + if runeValue > 127 { + return true + } + } + return false +} + +func splitAlternativeTitle(alternativeTitle string) (romajiTitle string, japaneseTitle string) { + if alternativeTitle == "" { + return "", "" + } + japaneseMatches := japaneseTextInParensPattern.FindStringSubmatch(alternativeTitle) + if len(japaneseMatches) > 1 { + japaneseTitle = japaneseMatches[1] + romajiTitle = strings.TrimSpace(japaneseTextInParensPattern.ReplaceAllString(alternativeTitle, "")) + return romajiTitle, japaneseTitle + } + if containsNonLatinCharacters(alternativeTitle) { + return "", alternativeTitle + } + return alternativeTitle, "" +} + func parseEpisodeRow(row *goquery.Selection) Episode { numberText := strings.TrimSpace(row.Find("td.episode-number").Text()) episodeNumber, _ := strconv.Atoi(numberText) @@ -18,10 +50,18 @@ func parseEpisodeRow(row *goquery.Selection) Episode { episodeURL, _ := titleLink.Attr("href") englishTitle := strings.TrimSpace(titleLink.Text()) - japaneseTitle := strings.TrimSpace(titleCell.Find("span.di-ib").Text()) + alternativeTitle := strings.TrimSpace(titleCell.Find("span.di-ib").Text()) + romajiTitle, japaneseTitle := splitAlternativeTitle(alternativeTitle) airedText := strings.TrimSpace(row.Find("td.episode-aired").Text()) + pollText := strings.TrimSpace(row.Find("td.episode-poll").Text()) + var episodeScore float64 + scoreMatches := episodeScorePattern.FindStringSubmatch(pollText) + if len(scoreMatches) > 1 { + episodeScore, _ = strconv.ParseFloat(scoreMatches[1], 64) + } + forumLink := row.Find("td.episode-forum a") forumURL, _ := forumLink.Attr("href") @@ -34,20 +74,194 @@ func parseEpisodeRow(row *goquery.Selection) Episode { Title: Title{ English: englishTitle, Japanese: japaneseTitle, + Romaji: romajiTitle, }, Aired: parseAiredDateString(airedText), + Score: episodeScore, ForumURL: forumURL, Filler: fillerTag.Length() > 0, Recap: recapTag.Length() > 0, } } +type aroundVideoEntry struct { + EpisodeNumber int `json:"episode_number"` + Thumbnail string `json:"thumbnail"` +} + +func extractEpisodeThumbnailsFromScript(document *goquery.Document) map[int]string { + thumbnails := make(map[int]string) + + document.Find("script").Each(func(index int, scriptElement *goquery.Selection) { + scriptContent := scriptElement.Text() + if !strings.Contains(scriptContent, "aroundVideos") { + return + } + + videosStartIndex := strings.Index(scriptContent, `videos`) + if videosStartIndex == -1 { + return + } + + bracketStartIndex := strings.Index(scriptContent[videosStartIndex:], "[") + if bracketStartIndex == -1 { + return + } + + arrayStartIndex := videosStartIndex + bracketStartIndex + bracketDepth := 0 + arrayEndIndex := -1 + for charIndex := arrayStartIndex; charIndex < len(scriptContent); charIndex++ { + if scriptContent[charIndex] == '[' { + bracketDepth++ + } else if scriptContent[charIndex] == ']' { + bracketDepth-- + if bracketDepth == 0 { + arrayEndIndex = charIndex + 1 + break + } + } + } + + if arrayEndIndex == -1 { + return + } + + videosJSON := scriptContent[arrayStartIndex:arrayEndIndex] + var videoEntries []aroundVideoEntry + if unmarshalErr := json.Unmarshal([]byte(videosJSON), &videoEntries); unmarshalErr != nil { + return + } + + for _, entry := range videoEntries { + if entry.Thumbnail != "" { + unescapedURL := strings.ReplaceAll(entry.Thumbnail, `\/`, `/`) + thumbnails[entry.EpisodeNumber] = unescapedURL + } + } + }) + + return thumbnails +} + +func buildCrunchyrollThumbnail(thumbnailHash string) Image { + cdnBase := "https://imgsrv.crunchyroll.com/cdn-cgi/image/fit=contain,format=auto,quality=70" + imagePath := fmt.Sprintf("/catalog/crunchyroll/%s.jpg", thumbnailHash) + + return Image{ + Small: fmt.Sprintf("%s,width=320%s", cdnBase, imagePath), + Medium: fmt.Sprintf("%s,width=640%s", cdnBase, imagePath), + Large: fmt.Sprintf("%s,width=1280%s", cdnBase, imagePath), + Original: fmt.Sprintf("%s,width=1920%s", cdnBase, imagePath), + } +} + +func buildEpisodeThumbnail(rawURL string) Image { + crunchyrollMatches := crunchyrollOldCDNPattern.FindStringSubmatch(rawURL) + if len(crunchyrollMatches) > 1 { + return buildCrunchyrollThumbnail(crunchyrollMatches[1]) + } + return Image{ + Original: rawURL, + } +} + +func extractEpisodeSynopsis(document *goquery.Document) string { + var synopsisText string + + document.Find("h2").Each(func(index int, headingElement *goquery.Selection) { + if synopsisText != "" { + return + } + if !strings.Contains(headingElement.Text(), "Synopsis") { + return + } + + nextSibling := headingElement.Next() + for nextSibling.Length() > 0 { + tagName := goquery.NodeName(nextSibling) + if tagName == "h2" || tagName == "h3" || tagName == "br" { + break + } + if nextSibling.HasClass("border_top") { + break + } + text := strings.TrimSpace(nextSibling.Text()) + if text != "" && !strings.Contains(text, "No synopsis information") { + synopsisText = text + return + } + nextSibling = nextSibling.Next() + } + }) + + if synopsisText == "" { + metaDescription, exists := document.Find(`meta[property="og:description"]`).Attr("content") + if exists { + trimmedDescription := strings.TrimSpace(metaDescription) + if trimmedDescription != "" && !strings.Contains(trimmedDescription, "No synopsis information") { + synopsisText = trimmedDescription + } + } + } + + return synopsisText +} + +func enrichEpisodesWithDetails(episodes []Episode, malID int) { + if len(episodes) == 0 { + return + } + + logger.Debugf("MALScraper", "Enriching %d episodes with details for MAL ID %d", len(episodes), malID) + + thumbnailMap := make(map[int]string) + thumbnailsExtracted := false + + for episodeIndex := range episodes { + if episodes[episodeIndex].URL == "" { + continue + } + + logger.Debugf("MALScraper", "Fetching episode %d/%d detail page for MAL ID %d", + episodes[episodeIndex].Number, len(episodes), malID) + + episodeDocument, fetchErr := makeRequest(episodes[episodeIndex].URL) + if fetchErr != nil { + logger.Warnf("MALClient", "Failed to fetch episode %d detail page for MAL ID %d: %v", + episodes[episodeIndex].Number, malID, fetchErr) + continue + } + + if !thumbnailsExtracted { + thumbnailMap = extractEpisodeThumbnailsFromScript(episodeDocument) + thumbnailsExtracted = true + logger.Debugf("MALScraper", "Extracted %d episode thumbnails from aroundVideos script", len(thumbnailMap)) + } + + episodes[episodeIndex].Synopsis = extractEpisodeSynopsis(episodeDocument) + + if thumbnailURL, exists := thumbnailMap[episodes[episodeIndex].Number]; exists { + episodes[episodeIndex].Preview = Preview{ + URL: episodes[episodeIndex].URL, + Thumbnail: buildEpisodeThumbnail(thumbnailURL), + } + logger.Debugf("MALScraper", "Episode %d: synopsis=%d chars, thumbnail=yes", + episodes[episodeIndex].Number, len(episodes[episodeIndex].Synopsis)) + } else { + logger.Debugf("MALScraper", "Episode %d: synopsis=%d chars, thumbnail=no", + episodes[episodeIndex].Number, len(episodes[episodeIndex].Synopsis)) + } + } +} + func GetAnimeEpisodesByMALID(malID int) ([]Episode, error) { var allEpisodes []Episode offset := 0 for { pageURL := fmt.Sprintf("%s/anime/%d/_/episode?offset=%d", malBaseURL, malID, offset) + logger.Debugf("MALScraper", "Fetching episode list page at offset %d for MAL ID %d", offset, malID) document, fetchErr := makeRequest(pageURL) if fetchErr != nil { if len(allEpisodes) > 0 { @@ -78,5 +292,7 @@ func GetAnimeEpisodesByMALID(malID int) ([]Episode, error) { offset += 100 } + enrichEpisodesWithDetails(allEpisodes, malID) + return allEpisodes, nil }
\ No newline at end of file diff --git a/utils/mal/types.go b/utils/mal/types.go index 9226072..241bc4e 100644 --- a/utils/mal/types.go +++ b/utils/mal/types.go @@ -1,17 +1,12 @@ package mal -type ImageFormat struct { +type Image struct { Small string Medium string Large string Original string } -type Image struct { - JPG ImageFormat - WEBP ImageFormat -} - type Title struct { English string Japanese string |
