Record and skip URLs the crawler has seen before

This commit is contained in:
James Mills
2021-02-01 23:46:26 +10:00
parent b398a3a709
commit 8a1161cf77
7 changed files with 126 additions and 36 deletions

View File

@@ -14,6 +14,7 @@ const (
feedsKeyPrefix = "/feeds"
sessionsKeyPrefix = "/sessions"
usersKeyPrefix = "/users"
urlsKeyPrefix = "/urls"
tokensKeyPrefix = "/tokens"
)
@@ -67,16 +68,6 @@ func (bs *BitcaskStore) Merge() error {
return nil
}
// HasFeed reports whether the store holds an entry for the feed called name.
// The lookup key is the feeds prefix joined to name with a slash.
func (bs *BitcaskStore) HasFeed(name string) bool {
	feedKey := feedsKeyPrefix + "/" + name
	return bs.db.Has([]byte(feedKey))
}
// DelFeed deletes the entry for the feed called name and returns whatever
// error the underlying Delete call reports.
func (bs *BitcaskStore) DelFeed(name string) error {
	feedKey := feedsKeyPrefix + "/" + name
	return bs.db.Delete([]byte(feedKey))
}
func (bs *BitcaskStore) HasUser(username string) bool {
key := []byte(fmt.Sprintf("%s/%s", usersKeyPrefix, username))
return bs.db.Has(key)
@@ -160,6 +151,72 @@ func (bs *BitcaskStore) GetAllUsers() ([]*User, error) {
return users, nil
}
// HasURL reports whether the store holds an entry under the given URL hash.
// The lookup key is the urls prefix joined to hash with a slash.
func (bs *BitcaskStore) HasURL(hash string) bool {
	urlKey := urlsKeyPrefix + "/" + hash
	return bs.db.Has([]byte(urlKey))
}
// DelURL deletes the entry stored under the given URL hash and returns
// whatever error the underlying Delete call reports.
func (bs *BitcaskStore) DelURL(hash string) error {
	urlKey := urlsKeyPrefix + "/" + hash
	return bs.db.Delete([]byte(urlKey))
}
// GetURL fetches and decodes the URL record stored under hash.
//
// It returns ErrURLNotFound when the underlying store reports
// bitcask.ErrKeyNotFound. Any other error from the store is returned
// unchanged. On a successful read the raw bytes are passed to LoadURL and
// its result is returned as-is.
func (bs *BitcaskStore) GetURL(hash string) (*URL, error) {
	key := []byte(fmt.Sprintf("%s/%s", urlsKeyPrefix, hash))
	data, err := bs.db.Get(key)
	if err == bitcask.ErrKeyNotFound {
		return nil, ErrURLNotFound
	}
	if err != nil {
		// Any other read failure must surface to the caller; decoding the
		// data returned alongside an error would hide the real cause.
		return nil, err
	}
	return LoadURL(data)
}
// SetURL serializes url with its Bytes method and writes the result under
// the given hash. A serialization error is returned before anything is
// written; otherwise the result of the underlying Put call is returned.
func (bs *BitcaskStore) SetURL(hash string, url *URL) error {
	payload, err := url.Bytes()
	if err != nil {
		return err
	}

	urlKey := urlsKeyPrefix + "/" + hash
	return bs.db.Put([]byte(urlKey), payload)
}
// URLCount returns the number of keys visited by a scan over the urls
// prefix. A scan error is logged rather than returned, and the count
// accumulated up to that point is returned.
func (bs *BitcaskStore) URLCount() int64 {
	var total int64

	// tally is called once per key visited by the scan.
	tally := func(_ []byte) error {
		total++
		return nil
	}

	err := bs.db.Scan([]byte(urlsKeyPrefix), tally)
	if err != nil {
		log.WithError(err).Error("error scanning")
	}

	return total
}
// ForEachURL scans every key under the urls prefix, reads and decodes the
// value with LoadURL, and calls f with the decoded URL. The first error
// from the read, the decode, or f is returned from the visitor to Scan, and
// whatever Scan returns is handed back to the caller.
func (bs *BitcaskStore) ForEachURL(f func(url *URL) error) error {
	visit := func(key []byte) error {
		raw, getErr := bs.db.Get(key)
		if getErr != nil {
			return getErr
		}

		decoded, loadErr := LoadURL(raw)
		if loadErr != nil {
			return loadErr
		}

		return f(decoded)
	}

	return bs.db.Scan([]byte(urlsKeyPrefix), visit)
}
func (bs *BitcaskStore) GetSession(sid string) (*session.Session, error) {
key := []byte(fmt.Sprintf("%s/%s", sessionsKeyPrefix, sid))
data, err := bs.db.Get(key)