Files
spyda/internal/scraper.go
2022-10-05 11:11:40 +10:00

76 lines
1.7 KiB
Go

package internal
import (
"fmt"
"os"
"path/filepath"
"regexp"
"github.com/apex/log"
readability "github.com/go-shiori/go-readability"
)
// cacheDir is the name of the sub-directory, joined onto the configured data
// directory, in which scraped entries are persisted.
const cacheDir = "cache"
// scrapeURLPattern matches strings that begin with an http:// or https://
// scheme. It is compiled once at package initialisation instead of on every
// call to Scrape.
var scrapeURLPattern = regexp.MustCompile(`^https?://.*`)

// Scrape fetches url, runs the response body through go-readability and
// returns the extracted article as an Entry.
//
// The entry is also persisted: its serialized form (entry.Bytes) is written
// to <conf.Data>/cache/<entry.Hash()>.json, creating the cache directory if
// it does not exist yet.
//
// ErrInvalidURL is returned when url is empty or does not start with
// "http://" or "https://". Every other failure (fetching, processing,
// creating the cache directory, serializing, writing) is logged and returned
// wrapped with %w, so callers can inspect the cause with errors.Is/errors.As.
func Scrape(conf *Config, url string) (*Entry, error) {
	// The pattern cannot match the empty string, so this single check also
	// rejects url == "".
	if !scrapeURLPattern.MatchString(url) {
		return nil, ErrInvalidURL
	}

	res, err := Request(conf, "GET", url, nil)
	if err != nil {
		log.WithError(err).Error("error fetching url")
		return nil, fmt.Errorf("error fetching url %s: %w", url, err)
	}
	defer res.Body.Close()

	article, err := readability.FromReader(res.Body, url)
	if err != nil {
		log.WithError(err).Error("error processing url")
		return nil, fmt.Errorf("error processing url %s: %w", url, err)
	}

	entry := &Entry{
		URL:         url,
		Title:       article.Title,
		Author:      article.Byline,
		Length:      article.Length,
		Summary:     article.Excerpt,
		Content:     article.TextContent,
		HTMLContent: article.Content,
	}

	p := filepath.Join(conf.Data, cacheDir)
	if err := os.MkdirAll(p, 0755); err != nil {
		log.WithError(err).Error("error creating cache directory")
		return nil, fmt.Errorf("error creating cache directory: %w", err)
	}

	// The cache file is named after the entry's hash.
	fn := filepath.Join(p, fmt.Sprintf("%s.json", entry.Hash()))

	data, err := entry.Bytes()
	if err != nil {
		log.WithError(err).Error("error serializing entry")
		// Wrap with %w (not %s) so the cause stays inspectable, like every
		// other error path in this function.
		return nil, fmt.Errorf("error serializing entry: %w", err)
	}

	if err := os.WriteFile(fn, data, 0644); err != nil {
		log.WithError(err).Error("error persisting entry")
		return nil, fmt.Errorf("error persisting entry: %w", err)
	}

	return entry, nil
}