Files
wowdoc-scraper/html-parser.go
2024-11-05 22:42:46 +01:00

50 lines
1.1 KiB
Go

package main
import (
"fmt"
"strings"
"github.com/PuerkitoBio/goquery"
)
// ParseHTML parses html and collects the href attribute of every <a> element
// matched by the selector "tr > td > a", in the order goquery yields them.
//
// Anchors that carry no href attribute are skipped and reported through
// Warning.Printf. When nothing matches, the result is an empty, non-nil slice.
// If the document cannot be parsed, ParseHTML returns a nil slice together
// with an error that wraps the underlying cause.
func ParseHTML(html string) ([]string, error) {
	doc, err := goquery.NewDocumentFromReader(strings.NewReader(html))
	if err != nil {
		// %w (rather than %v) keeps the cause reachable via errors.Is / errors.As
		// while producing the same message text.
		return nil, fmt.Errorf("failed parsing html: %w", err)
	}

	anchors := doc.Find("tr > td > a")

	// Sized up front from the match count; make keeps the slice non-nil even
	// when there are no matches.
	res := make([]string, 0, anchors.Length())
	anchors.Each(func(_ int, s *goquery.Selection) {
		href, exists := s.Attr("href")
		if !exists {
			// Log the anchor's text: formatting the *goquery.Selection itself
			// only prints internal pointer addresses, which identifies nothing.
			Warning.Printf("href not found for element %q", strings.TrimSpace(s.Text()))
			return
		}
		res = append(res, href)
	})
	return res, nil
}
//doc, err := goquery.NewDocumentFromReader(strings.NewReader(*task.Html))
//if err != nil {
// Error.Printf("failed to parse html: %v", err)
// return
//}
//doc.Find("img").Each(func(i int, s *goquery.Selection) {
// parent := s.Parent()
// if parent.Is("a") {
// href, _ := parent.Attr("href")
// href, ok := fixLink(href)
// if ok {
// hw.output <- &ImageDownloadTask{Url: &href, Topic: task.Topic}
// }
// } else {
// src, _ := s.Attr("src")
// src, ok := fixLink(src)
// if ok {
// hw.output <- &ImageDownloadTask{Url: &src, Topic: task.Topic}
// }
// }
//})