Files
wowdoc-scraper/html-parser.go

111 lines
2.4 KiB
Go

package main
import (
"fmt"
"log"
"strings"
"github.com/PuerkitoBio/goquery"
"github.com/davecgh/go-spew/spew"
)
// ParseHTML parses html and collects the href attribute of every <a> element
// matched by the selector "tr > td > a" (an anchor that is a direct child of
// a table cell that is a direct child of a table row).
//
// Anchors without an href attribute are reported through the Warning logger
// and skipped. The complete input document and the number of links found are
// written to the standard logger.
//
// The returned slice is never nil. The error is non-nil only when goquery
// fails to parse the input; it wraps the underlying error so callers can
// inspect it with errors.Is / errors.As.
func ParseHTML(html string) ([]string, error) {
	res := []string{}
	doc, err := goquery.NewDocumentFromReader(strings.NewReader(html))
	if err != nil {
		// %w (rather than %v) keeps the original error in the chain.
		return res, fmt.Errorf("failed parsing html: %w", err)
	}

	log.Printf("Looking for links in %s", html)
	doc.Find("tr > td > a").Each(func(i int, s *goquery.Selection) {
		href, exists := s.Attr("href")
		if !exists {
			Warning.Printf("href not found for element %v", s)
			return
		}
		res = append(res, href)
	})
	log.Printf("Found %d links", len(res))
	return res, nil
}
// ParseDoc parses the HTML of a documentation page into a Function.
//
// It visits the <p> and <ul> elements that are direct children of
// div.api-listing. A paragraph whose text, ignoring surrounding whitespace,
// is "Arguments:" or "Returns:" selects which field the following list
// fills; every other paragraph is reported through the Warning logger. Each
// <ul> is parsed with parseUl and its result is assigned to res.Arguments or
// res.Returns depending on the currently selected section. A list seen while
// no section is selected is ignored.
//
// The complete input document is written to the standard logger and the
// resulting Function is dumped with spew before returning.
//
// The error is non-nil only when goquery fails to parse the input; it wraps
// the underlying error so callers can inspect it with errors.Is / errors.As.
func ParseDoc(html string) (Function, error) {
	res := Function{}
	log.Printf("Parsing doc %s", html)
	doc, err := goquery.NewDocumentFromReader(strings.NewReader(html))
	if err != nil {
		// %w (rather than %v) keeps the original error in the chain.
		return res, fmt.Errorf("failed parsing html: %w", err)
	}

	// Section selected by the most recent recognised heading paragraph.
	isArgs := false
	isReturn := false
	doc.Find("div.api-listing > p, div.api-listing > ul").Each(func(i int, s *goquery.Selection) {
		if s.Is("p") {
			// Trim before matching so that whitespace or line breaks inside
			// the <p> markup do not hide a heading.
			switch strings.TrimSpace(s.Text()) {
			case "Arguments:":
				isArgs = true
				isReturn = false
				return
			case "Returns:":
				isReturn = true
				isArgs = false
				return
			default:
				// NOTE(review): the previously selected section stays active
				// here, so a list following an unknown paragraph is still
				// stored in it — confirm this is intended.
				Warning.Printf("Unknown p tag: %s", s.Text())
				return
			}
		}
		if s.Is("ul") {
			params, parseErr := parseUl(s)
			if parseErr != nil {
				Error.Printf("Error parsing ul %s: %v", s.Text(), parseErr)
				return
			}
			// A later list for the same section replaces the earlier one.
			if isArgs {
				res.Arguments = params
			} else if isReturn {
				res.Returns = params
			}
		}
	})
	spew.Dump(res)
	return res, nil
}
// parseUl converts the <li> descendants of ul into Parameter values.
//
// For each item, the text of the first <code> element becomes the parameter
// name. When the item holds more than one <code> element, the text of the
// last one becomes the parameter type. Items with no <code> element, an
// empty name, or an empty type are reported through the Warning logger and
// left out of the result.
//
// The returned slice is never nil and the returned error is always nil.
func parseUl(ul *goquery.Selection) ([]Parameter, error) {
	params := []Parameter{}
	ul.Find("li").Each(func(_ int, item *goquery.Selection) {
		log.Printf("Parsing li %s", item.Text())

		codeTags := item.Find("code")
		count := codeTags.Length()
		if count == 0 {
			Warning.Printf("No code found for %s", item.Text())
			return
		}

		entry := Parameter{}
		if paramName := codeTags.First().Text(); paramName != "" {
			entry.Name = paramName
		} else {
			Warning.Printf("No name found for %s", item.Text())
			return
		}

		// With a single <code> element there is no separate type to read.
		if count > 1 {
			paramType := codeTags.Last().Text()
			if paramType == "" {
				Warning.Printf("No type found for %s", item.Text())
				return
			}
			entry.Type = paramType
		}

		log.Printf("Found param %+v", entry)
		params = append(params, entry)
	})
	return params, nil
}