spyda/internal/scraper.go

package internal

import (
	"fmt"
	"io/ioutil"
	"path/filepath"
	"regexp"

	"github.com/apex/log"
	readability "github.com/go-shiori/go-readability"
)

// Scrape fetches the given URL, extracts its readable content with
// go-readability, writes the resulting Entry to disk as JSON, and returns it.
func Scrape(conf *Config, url string) (*Entry, error) {
	if url == "" {
		return nil, ErrInvalidURL
	}

	// Only absolute http:// or https:// URLs are accepted.
	match, err := regexp.MatchString(`^https?://.*`, url)
	if err != nil {
		log.WithError(err).Error("error parsing url")
		return nil, fmt.Errorf("error parsing url %s: %w", url, err)
	}
	if !match {
		return nil, ErrInvalidURL
	}

	res, err := Request(conf, "GET", url, nil)
	if err != nil {
		log.WithError(err).Error("error fetching url")
		return nil, fmt.Errorf("error fetching url %s: %w", url, err)
	}
	defer res.Body.Close()

	article, err := readability.FromReader(res.Body, url)
	if err != nil {
		log.WithError(err).Error("error processing url")
		return nil, fmt.Errorf("error processing url %s: %w", url, err)
	}

	entry := &Entry{
		URL:         url,
		Title:       article.Title,
		Author:      article.Byline,
		Length:      article.Length,
		Summary:     article.Excerpt,
		Content:     article.TextContent,
		HTMLContent: article.Content,
	}

	// Persist the entry as <hash>.json under the configured data directory.
	fn := filepath.Join(conf.Data, fmt.Sprintf("%s.json", entry.Hash()))
	data, err := entry.Bytes()
	if err != nil {
		log.WithError(err).Error("error serializing entry")
		return nil, fmt.Errorf("error serializing entry: %w", err)
	}
	if err := ioutil.WriteFile(fn, data, 0644); err != nil {
		log.WithError(err).Error("error persisting entry")
		return nil, fmt.Errorf("error persisting entry: %w", err)
	}

	return entry, nil
}
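
// Example usage (a sketch, not part of the original file): it assumes Config
// can be constructed with only its Data directory set and that Request needs
// no further setup; error handling is minimal.
//
//	conf := &Config{Data: "/tmp/spyda"}
//	entry, err := Scrape(conf, "https://example.com/some-article")
//	if err != nil {
//		log.WithError(err).Fatal("scrape failed")
//	}
//	fmt.Println(entry.Title, entry.Summary)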