From 8a1161cf77e26e94e214f12d93c5c7c060f95be4 Mon Sep 17 00:00:00 2001 From: James Mills Date: Mon, 1 Feb 2021 23:46:26 +1000 Subject: [PATCH] Record and skip urls the crawler has seen before --- internal/bitcask_store.go | 77 ++++++++++++++++++++++++++++++++++----- internal/crawler.go | 29 ++++++++++++--- internal/models.go | 11 ++---- internal/scraper.go | 6 ++- internal/server.go | 24 ++++++------ internal/store.go | 8 ++++ internal/utils.go | 7 ++++ 7 files changed, 126 insertions(+), 36 deletions(-) diff --git a/internal/bitcask_store.go b/internal/bitcask_store.go index ec4a606..cc11df3 100644 --- a/internal/bitcask_store.go +++ b/internal/bitcask_store.go @@ -14,6 +14,7 @@ const ( feedsKeyPrefix = "/feeds" sessionsKeyPrefix = "/sessions" usersKeyPrefix = "/users" + urlsKeyPrefix = "/urls" tokensKeyPrefix = "/tokens" ) @@ -67,16 +68,6 @@ func (bs *BitcaskStore) Merge() error { return nil } -func (bs *BitcaskStore) HasFeed(name string) bool { - key := []byte(fmt.Sprintf("%s/%s", feedsKeyPrefix, name)) - return bs.db.Has(key) -} - -func (bs *BitcaskStore) DelFeed(name string) error { - key := []byte(fmt.Sprintf("%s/%s", feedsKeyPrefix, name)) - return bs.db.Delete(key) -} - func (bs *BitcaskStore) HasUser(username string) bool { key := []byte(fmt.Sprintf("%s/%s", usersKeyPrefix, username)) return bs.db.Has(key) @@ -160,6 +151,72 @@ func (bs *BitcaskStore) GetAllUsers() ([]*User, error) { return users, nil } +func (bs *BitcaskStore) HasURL(hash string) bool { + key := []byte(fmt.Sprintf("%s/%s", urlsKeyPrefix, hash)) + return bs.db.Has(key) +} + +func (bs *BitcaskStore) DelURL(hash string) error { + key := []byte(fmt.Sprintf("%s/%s", urlsKeyPrefix, hash)) + return bs.db.Delete(key) +} + +func (bs *BitcaskStore) GetURL(hash string) (*URL, error) { + key := []byte(fmt.Sprintf("%s/%s", urlsKeyPrefix, hash)) + data, err := bs.db.Get(key) + if err == bitcask.ErrKeyNotFound { + return nil, ErrURLNotFound + } + return LoadURL(data) +} + +func (bs *BitcaskStore) SetURL(hash string, url *URL) error { + data, err := url.Bytes() + if err != nil { + return err + } + + key := []byte(fmt.Sprintf("%s/%s", urlsKeyPrefix, hash)) + if err := bs.db.Put(key, data); err != nil { + return err + } + return nil +} + +func (bs *BitcaskStore) URLCount() int64 { + var count int64 + + if err := bs.db.Scan([]byte(urlsKeyPrefix), func(_ []byte) error { + count++ + return nil + }); err != nil { + log.WithError(err).Error("error scanning") + } + + return count +} + +func (bs *BitcaskStore) ForEachURL(f func(url *URL) error) error { + err := bs.db.Scan([]byte(urlsKeyPrefix), func(key []byte) error { + data, err := bs.db.Get(key) + if err != nil { + return err + } + + url, err := LoadURL(data) + if err != nil { + return err + } + + return f(url) + }) + if err != nil { + return err + } + + return nil +} + func (bs *BitcaskStore) GetSession(sid string) (*session.Session, error) { key := []byte(fmt.Sprintf("%s/%s", sessionsKeyPrefix, sid)) data, err := bs.db.Get(key) diff --git a/internal/crawler.go b/internal/crawler.go index 1e113ba..905ae28 100644 --- a/internal/crawler.go +++ b/internal/crawler.go @@ -1,6 +1,8 @@ package internal import ( + "time" + log "github.com/sirupsen/logrus" ) @@ -11,21 +13,23 @@ type Crawler interface { type crawler struct { conf *Config - q chan string + db Store + queue chan string indexer Indexer } -func NewCrawler(conf *Config, indexer Indexer) (Crawler, error) { +func NewCrawler(conf *Config, db Store, indexer Indexer) (Crawler, error) { return &crawler{ conf: conf, - q: make(chan string), + db: db, + queue: make(chan string), indexer: indexer, }, nil } func (c *crawler) loop() { for { - url := <-c.q + url := <-c.queue log.Debugf("crawling %s", url) links, err := GetLinks(url) @@ -35,13 +39,28 @@ func (c *crawler) loop() { } for link := range links { + hash := HashURL(link) + + if c.db.HasURL(hash) { + log.Debugf("seen %s (skipping)", link) + continue + } + log.Debugf("found %s", link) + + url := NewURL(link) + url.CrawledAt = time.Now() + entry, err := Scrape(c.conf, link) if err != nil { log.WithError(err).Error("error scraping %s", link) } else { if err := c.indexer.Index(entry); err != nil { log.WithError(err).Error("error indexing %s", link) + } else { + if err := c.db.SetURL(hash, url); err != nil { + log.WithError(err).Error("error recording url %s", link) + } } } } @@ -49,7 +68,7 @@ func (c *crawler) loop() { } func (c *crawler) Crawl(url string) error { - c.q <- url + c.queue <- url return nil } diff --git a/internal/models.go b/internal/models.go index b8e12be..dba50a3 100644 --- a/internal/models.go +++ b/internal/models.go @@ -8,10 +8,6 @@ import ( log "github.com/sirupsen/logrus" ) -const ( - URLHashLength = 15 -) - // User ... type User struct { Username string @@ -114,8 +110,8 @@ func (u *User) Bytes() ([]byte, error) { } // NewURL ... -func NewURL() *URL { - u := &URL{} +func NewURL(url string) *URL { + u := &URL{URL: url} if err := defaults.Set(u); err != nil { log.WithError(err).Error("error creating new URI object") } @@ -140,8 +136,7 @@ func (u *URL) Hash() string { return u.hash } - hash := FastHash(u.String()) - u.hash = hash[len(hash)-URLHashLength:] + u.hash = HashURL(u.String()) return u.hash } diff --git a/internal/scraper.go b/internal/scraper.go index a274c93..cd61c97 100644 --- a/internal/scraper.go +++ b/internal/scraper.go @@ -10,6 +10,10 @@ import ( readability "github.com/go-shiori/go-readability" ) +const ( + cacheDir = "cache" +) + func Scrape(conf *Config, url string) (*Entry, error) { if url == "" { return nil, ErrInvalidURL @@ -48,7 +52,7 @@ func Scrape(conf *Config, url string) (*Entry, error) { HTMLContent: article.Content, } - fn := filepath.Join(conf.Data, fmt.Sprintf("%s.json", entry.Hash())) + fn := filepath.Join(conf.Data, cacheDir, fmt.Sprintf("%s.json", entry.Hash())) data, err := entry.Bytes() if err != nil { log.WithError(err).Error("error serializing entry") diff --git a/internal/server.go b/internal/server.go index 03909a9..0390bbd 100644 --- a/internal/server.go +++ b/internal/server.go @@ -351,18 +351,6 @@ func NewServer(bind string, options ...Option) (*Server, error) { return nil, fmt.Errorf("error validating config: %w", err) } - indexer, err := NewIndexer(config) - if err != nil { - log.WithError(err).Error("error creating indexer") - return nil, err - } - - crawler, err := NewCrawler(config, indexer) - if err != nil { - log.WithError(err).Error("error creating crawler") - return nil, err - } - db, err := NewStore(config.Store) if err != nil { log.WithError(err).Error("error creating store") @@ -380,6 +368,18 @@ func NewServer(bind string, options ...Option) (*Server, error) { return nil, err } + indexer, err := NewIndexer(config) + if err != nil { + log.WithError(err).Error("error creating indexer") + return nil, err + } + + crawler, err := NewCrawler(config, db, indexer) + if err != nil { + log.WithError(err).Error("error creating crawler") + return nil, err + } + router := NewRouter() am := auth.NewManager(auth.NewOptions("/login", "/register")) diff --git a/internal/store.go b/internal/store.go index e1aa7ef..cafc1f1 100644 --- a/internal/store.go +++ b/internal/store.go @@ -11,6 +11,7 @@ var ( ErrInvalidStore = errors.New("error: invalid store") ErrUserNotFound = errors.New("error: user not found") ErrTokenNotFound = errors.New("error: token not found") + ErrURLNotFound = errors.New("error: url not found") ErrInvalidSession = errors.New("error: invalid session") ) @@ -27,6 +28,13 @@ type Store interface { SearchUsers(prefix string) []string GetAllUsers() ([]*User, error) + DelURL(hash string) error + HasURL(hash string) bool + GetURL(hash string) (*URL, error) + SetURL(hash string, url *URL) error + URLCount() int64 + ForEachURL(f func(*URL) error) error + GetSession(sid string) (*session.Session, error) SetSession(sid string, sess *session.Session) error HasSession(sid string) bool diff --git a/internal/utils.go b/internal/utils.go index ff7b96c..36362d7 100644 --- a/internal/utils.go +++ b/internal/utils.go @@ -42,6 +42,8 @@ const ( WeekAgo = DayAgo * 7 MonthAgo = DayAgo * 30 YearAgo = MonthAgo * 12 + + URLHashLength = 15 ) var ( @@ -73,6 +75,11 @@ func FastHash(s string) string { return hash } +func HashURL(url string) string { + hash := FastHash(url) + return hash[len(hash)-URLHashLength:] +} + func IntPow(x, y int) int { return int(math.Pow(float64(x), float64(y))) }