Record and skip URLs the crawler has seen before

Author: James Mills
Date:   2021-02-01 23:46:26 +10:00
Parent: b398a3a709
Commit: 8a1161cf77
7 changed files with 126 additions and 36 deletions


@@ -10,6 +10,10 @@ import (
 	readability "github.com/go-shiori/go-readability"
 )
 
+const (
+	cacheDir = "cache"
+)
+
 func Scrape(conf *Config, url string) (*Entry, error) {
 	if url == "" {
 		return nil, ErrInvalidURL
@@ -48,7 +52,7 @@ func Scrape(conf *Config, url string) (*Entry, error) {
 		HTMLContent: article.Content,
 	}
 
-	fn := filepath.Join(conf.Data, fmt.Sprintf("%s.json", entry.Hash()))
+	fn := filepath.Join(conf.Data, cacheDir, fmt.Sprintf("%s.json", entry.Hash()))
 	data, err := entry.Bytes()
 	if err != nil {
 		log.WithError(err).Error("error serializing entry")
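
The skip check itself lives in one of the changed files not shown here; what the visible hunks establish is the cache layout: each scraped entry is serialized to <data>/cache/<hash>.json. A minimal sketch of how a record-and-skip check could work against that layout, assuming the hash is derived from the URL (hashURL and seen below are hypothetical helpers, not this repository's API):

package main

import (
	"crypto/sha256"
	"encoding/hex"
	"fmt"
	"os"
	"path/filepath"
)

// hashURL derives a stable cache key from a URL. Hypothetical helper;
// it stands in for whatever Entry.Hash() actually computes over.
func hashURL(url string) string {
	sum := sha256.Sum256([]byte(url))
	return hex.EncodeToString(sum[:])
}

// seen reports whether a cache file for this URL already exists under
// dataDir/cache, mirroring the path built in Scrape above.
func seen(dataDir, url string) bool {
	fn := filepath.Join(dataDir, "cache", fmt.Sprintf("%s.json", hashURL(url)))
	_, err := os.Stat(fn)
	return err == nil
}

func main() {
	if seen("data", "https://example.com/article") {
		fmt.Println("skipping: already crawled")
		return
	}
	fmt.Println("scraping and recording...")
}

Keying the check on the existence of the cache file keeps the seen-set on disk, so it survives restarts without maintaining a separate index.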