77 lines
1.8 KiB
Go
77 lines
1.8 KiB
Go
package internal
|
|
|
|
import (
|
|
"fmt"
|
|
"io/ioutil"
|
|
"os"
|
|
"path/filepath"
|
|
"regexp"
|
|
|
|
"github.com/apex/log"
|
|
readability "github.com/go-shiori/go-readability"
|
|
)
|
|
|
|
// cacheDir is the name of the sub-directory, relative to the configured data
// directory, in which scraped entries are persisted.
const cacheDir = "cache"
|
|
|
|
func Scrape(conf *Config, url string) (*Entry, error) {
|
|
if url == "" {
|
|
return nil, ErrInvalidURL
|
|
}
|
|
|
|
match, err := regexp.MatchString(`^https?://.*`, url)
|
|
if err != nil {
|
|
log.WithError(err).Error("error parsing url")
|
|
return nil, fmt.Errorf("error parsing url %s: %w", url, err)
|
|
}
|
|
|
|
if !match {
|
|
return nil, ErrInvalidURL
|
|
}
|
|
|
|
res, err := Request(conf, "GET", url, nil)
|
|
if err != nil {
|
|
log.WithError(err).Error("error fetching url")
|
|
return nil, fmt.Errorf("error fetching url %s: %w", url, err)
|
|
}
|
|
defer res.Body.Close()
|
|
|
|
article, err := readability.FromReader(res.Body, url)
|
|
if err != nil {
|
|
log.WithError(err).Error("error processing url")
|
|
return nil, fmt.Errorf("error processing url %s: %w", url, err)
|
|
}
|
|
|
|
entry := &Entry{
|
|
URL: url,
|
|
Title: article.Title,
|
|
Author: article.Byline,
|
|
Length: article.Length,
|
|
Summary: article.Excerpt,
|
|
Content: article.TextContent,
|
|
HTMLContent: article.Content,
|
|
}
|
|
|
|
p := filepath.Join(conf.Data, cacheDir)
|
|
if err := os.MkdirAll(p, 0755); err != nil {
|
|
log.WithError(err).Error("error creating cache directory")
|
|
return nil, fmt.Errorf("error creating cache directory: %w", err)
|
|
}
|
|
|
|
fn := filepath.Join(p, fmt.Sprintf("%s.json", entry.Hash()))
|
|
|
|
data, err := entry.Bytes()
|
|
if err != nil {
|
|
log.WithError(err).Error("error serializing entry")
|
|
return nil, fmt.Errorf("error serializing entry: %s", err)
|
|
}
|
|
|
|
if err := ioutil.WriteFile(fn, data, 0644); err != nil {
|
|
log.WithError(err).Error("error persisting entry")
|
|
return nil, fmt.Errorf("error persisting entry: %w", err)
|
|
}
|
|
|
|
return entry, nil
|
|
}
|