Files
wowdoc-scraper/main.go
2024-11-06 00:52:48 +01:00

90 lines
2.0 KiB
Go

package main
import (
_ "embed"
"fmt"
"io"
"log"
"os"
"regexp"
)
// rootUrl is the base address of the site being scraped. Note the
// trailing slash: paths joined onto it must not start with one.
var rootUrl = `https://wowprogramming.com/`

// apiUrl is the API index page listing every documented function.
// rootUrl already ends in a slash, so no separator is inserted here
// (the original "%s/docs" format produced a double slash).
var apiUrl = fmt.Sprintf("%sdocs/api.html", rootUrl)

// Error logs to the log file, stderr and stdout with a red prefix.
var Error *log.Logger

// Warning logs to the log file and stdout with a yellow prefix.
var Warning *log.Logger
func init() {
log.SetFlags(log.Lmicroseconds | log.Lshortfile)
logFile, err := os.Create("main.log")
if err != nil {
log.Printf("Error creating log file: %v", err)
os.Exit(1)
}
logger := io.MultiWriter(os.Stdout, logFile)
log.SetOutput(logger)
Error = log.New(io.MultiWriter(logFile, os.Stderr, os.Stdout),
fmt.Sprintf("%sERROR:%s ", "\033[0;101m", "\033[0m"),
log.Lmicroseconds|log.Lshortfile)
Warning = log.New(io.MultiWriter(logFile, os.Stdout),
fmt.Sprintf("%sWarning:%s ", "\033[0;93m", "\033[0m"),
log.Lmicroseconds|log.Lshortfile)
}
// html holds an embedded copy of the API index page.
// NOTE(review): presumably a cached snapshot of apiUrl's response for
// offline testing (see the commented-out WriteFile in main) — confirm;
// it is not referenced in the visible portion of this file.
//go:embed test.html
var html string

// doc holds an embedded sample per-function documentation page.
// NOTE(review): likely an offline fixture for the doc parser — confirm.
//go:embed doc.html
var doc string
// pageNameExtractor captures the base name of the last ".html" path
// segment of a URL: "/docs/api/Foo.html" yields "Foo". The dot is
// escaped so that names like "Foohtmlx" no longer match (the original
// `.html` treated the dot as a wildcard matching any character).
var pageNameExtractor = regexp.MustCompile(`/([^/]+)\.html`)

// outDir is the directory generated documentation files are written to.
var outDir = "out"
// main drives the scrape: fetch the API index, extract the list of
// documentation pages, then convert each page in turn. Any failure is
// logged and aborts the run.
func main() {
	body, err := Fetch(apiUrl)
	if err != nil {
		Error.Printf("Error fetching %s: %v", apiUrl, err)
		return
	}
	//os.WriteFile("test.html", []byte(res), 0644)
	pages, err := ParseHTML(body)
	if err != nil {
		Error.Printf("Error parsing HTML: %v", err)
		return
	}
	for _, p := range pages {
		log.Printf("Processing page %s", p)
		if err := ParseDocFull(p); err != nil {
			Error.Printf("Error parsing doc: %v", err)
			return
		}
	}
}
// ParseDocFull fetches the documentation page at url (joined onto
// rootUrl), parses it into a function description and writes the result
// into outDir. The function's name is taken from the final ".html"
// path segment of url.
//
// Errors are wrapped with %w so callers can unwrap the underlying
// cause with errors.Is/errors.As; messages follow Go convention
// (lowercase, no "Error"/"Failed to" prefix).
func ParseDocFull(url string) error {
	pname := pageNameExtractor.FindStringSubmatch(url)
	if len(pname) != 2 {
		return fmt.Errorf("extracting page name from %s: no match", url)
	}
	res, err := Fetch(rootUrl + url)
	if err != nil {
		return fmt.Errorf("fetching %s: %w", rootUrl+url, err)
	}
	function, err := ParseDoc(res)
	if err != nil {
		return fmt.Errorf("parsing HTML for %s: %w", url, err)
	}
	function.Name = pname[1]
	if err := function.WriteFile(outDir); err != nil {
		return fmt.Errorf("writing file for %s: %w", url, err)
	}
	return nil
}