Record and skip urls the crawler has seen before

James Mills
2021-02-01 23:46:26 +10:00
parent b398a3a709
commit 8a1161cf77
7 changed files with 126 additions and 36 deletions

View File

@@ -14,6 +14,7 @@ const (
     feedsKeyPrefix    = "/feeds"
     sessionsKeyPrefix = "/sessions"
     usersKeyPrefix    = "/users"
+    urlsKeyPrefix     = "/urls"
     tokensKeyPrefix   = "/tokens"
 )
@@ -67,16 +68,6 @@ func (bs *BitcaskStore) Merge() error {
     return nil
 }

-func (bs *BitcaskStore) HasFeed(name string) bool {
-    key := []byte(fmt.Sprintf("%s/%s", feedsKeyPrefix, name))
-    return bs.db.Has(key)
-}
-
-func (bs *BitcaskStore) DelFeed(name string) error {
-    key := []byte(fmt.Sprintf("%s/%s", feedsKeyPrefix, name))
-    return bs.db.Delete(key)
-}
-
 func (bs *BitcaskStore) HasUser(username string) bool {
     key := []byte(fmt.Sprintf("%s/%s", usersKeyPrefix, username))
     return bs.db.Has(key)
@@ -160,6 +151,72 @@ func (bs *BitcaskStore) GetAllUsers() ([]*User, error) {
     return users, nil
 }

+func (bs *BitcaskStore) HasURL(hash string) bool {
+    key := []byte(fmt.Sprintf("%s/%s", urlsKeyPrefix, hash))
+    return bs.db.Has(key)
+}
+
+func (bs *BitcaskStore) DelURL(hash string) error {
+    key := []byte(fmt.Sprintf("%s/%s", urlsKeyPrefix, hash))
+    return bs.db.Delete(key)
+}
+
+func (bs *BitcaskStore) GetURL(hash string) (*URL, error) {
+    key := []byte(fmt.Sprintf("%s/%s", urlsKeyPrefix, hash))
+    data, err := bs.db.Get(key)
+    if err == bitcask.ErrKeyNotFound {
+        return nil, ErrURLNotFound
+    }
+    return LoadURL(data)
+}
+
+func (bs *BitcaskStore) SetURL(hash string, url *URL) error {
+    data, err := url.Bytes()
+    if err != nil {
+        return err
+    }
+
+    key := []byte(fmt.Sprintf("%s/%s", urlsKeyPrefix, hash))
+    if err := bs.db.Put(key, data); err != nil {
+        return err
+    }
+
+    return nil
+}
+
+func (bs *BitcaskStore) URLCount() int64 {
+    var count int64
+
+    if err := bs.db.Scan([]byte(urlsKeyPrefix), func(_ []byte) error {
+        count++
+        return nil
+    }); err != nil {
+        log.WithError(err).Error("error scanning")
+    }
+
+    return count
+}
+
+func (bs *BitcaskStore) ForEachURL(f func(url *URL) error) error {
+    err := bs.db.Scan([]byte(urlsKeyPrefix), func(key []byte) error {
+        data, err := bs.db.Get(key)
+        if err != nil {
+            return err
+        }
+
+        url, err := LoadURL(data)
+        if err != nil {
+            return err
+        }
+
+        return f(url)
+    })
+    if err != nil {
+        return err
+    }
+    return nil
+}
+
 func (bs *BitcaskStore) GetSession(sid string) (*session.Session, error) {
     key := []byte(fmt.Sprintf("%s/%s", sessionsKeyPrefix, sid))
     data, err := bs.db.Get(key)
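Taken together these methods give the crawler a persistent seen-set in Bitcask, keyed by URL hash under the /urls prefix. A minimal sketch of the intended round trip, assuming a Store value db as constructed in NewServer below (the example link is hypothetical):

    link := "https://example.com/"
    hash := HashURL(link)
    if !db.HasURL(hash) {
        url := NewURL(link)
        url.CrawledAt = time.Now()
        if err := db.SetURL(hash, url); err != nil {
            log.WithError(err).Error("error recording url")
        }
    }
    // Later lookups return the recorded URL, or ErrURLNotFound.
    if u, err := db.GetURL(hash); err == nil {
        log.Debugf("recorded %s at %s", u.URL, u.CrawledAt)
    }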

View File

@@ -1,6 +1,8 @@
 package internal

 import (
+    "time"
+
     log "github.com/sirupsen/logrus"
 )
@@ -11,21 +13,23 @@ type Crawler interface {
 type crawler struct {
     conf    *Config
-    q       chan string
+    db      Store
+    queue   chan string
     indexer Indexer
 }

-func NewCrawler(conf *Config, indexer Indexer) (Crawler, error) {
+func NewCrawler(conf *Config, db Store, indexer Indexer) (Crawler, error) {
     return &crawler{
         conf:    conf,
-        q:       make(chan string),
+        db:      db,
+        queue:   make(chan string),
         indexer: indexer,
     }, nil
 }

 func (c *crawler) loop() {
     for {
-        url := <-c.q
+        url := <-c.queue

         log.Debugf("crawling %s", url)

         links, err := GetLinks(url)
@@ -35,13 +39,28 @@ func (c *crawler) loop() {
         }

         for link := range links {
+            hash := HashURL(link)
+            if c.db.HasURL(hash) {
+                log.Debugf("seen %s (skipping)", link)
+                continue
+            }
+
             log.Debugf("found %s", link)

+            url := NewURL(link)
+            url.CrawledAt = time.Now()
+
             entry, err := Scrape(c.conf, link)
             if err != nil {
                 log.WithError(err).Error("error scraping %s", link)
             } else {
                 if err := c.indexer.Index(entry); err != nil {
                     log.WithError(err).Error("error indexing %s", link)
+                } else {
+                    if err := c.db.SetURL(hash, url); err != nil {
+                        log.WithError(err).Error("error recording url %s", link)
+                    }
                 }
             }
         }
@@ -49,7 +68,7 @@ func (c *crawler) loop() {
 }

 func (c *crawler) Crawl(url string) error {
-    c.q <- url
+    c.queue <- url
     return nil
 }

View File

@@ -8,10 +8,6 @@ import (
     log "github.com/sirupsen/logrus"
 )

-const (
-    URLHashLength = 15
-)
-
 // User ...
 type User struct {
     Username string
@@ -114,8 +110,8 @@ func (u *User) Bytes() ([]byte, error) {
 }

 // NewURL ...
-func NewURL() *URL {
-    u := &URL{}
+func NewURL(url string) *URL {
+    u := &URL{URL: url}
     if err := defaults.Set(u); err != nil {
         log.WithError(err).Error("error creating new URI object")
     }
@@ -140,8 +136,7 @@ func (u *URL) Hash() string {
         return u.hash
     }

-    hash := FastHash(u.String())
-    u.hash = hash[len(hash)-URLHashLength:]
+    u.hash = HashURL(u.String())

     return u.hash
 }
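With the truncation logic moved into HashURL, (u *URL).Hash() and the hash the crawler computes from a raw link agree by construction. As a quick check (a sketch; the concrete hash value depends on FastHash):

    u := NewURL("https://example.com/")
    // true: both take the last URLHashLength characters of FastHash
    fmt.Println(u.Hash() == HashURL(u.String()))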

View File

@@ -10,6 +10,10 @@ import (
     readability "github.com/go-shiori/go-readability"
 )

+const (
+    cacheDir = "cache"
+)
+
 func Scrape(conf *Config, url string) (*Entry, error) {
     if url == "" {
         return nil, ErrInvalidURL
@@ -48,7 +52,7 @@ func Scrape(conf *Config, url string) (*Entry, error) {
         HTMLContent: article.Content,
     }

-    fn := filepath.Join(conf.Data, fmt.Sprintf("%s.json", entry.Hash()))
+    fn := filepath.Join(conf.Data, cacheDir, fmt.Sprintf("%s.json", entry.Hash()))

     data, err := entry.Bytes()
     if err != nil {
         log.WithError(err).Error("error serializing entry")
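Cached entries now land in a cache/ subdirectory of the data directory instead of its root. The commit does not show that directory being created, so presumably it already exists or is made at startup; a defensive sketch (the os.MkdirAll call is an assumption, not part of this commit):

    fn := filepath.Join(conf.Data, cacheDir, fmt.Sprintf("%s.json", entry.Hash()))
    // Assumption: ensure the cache directory exists before writing.
    if err := os.MkdirAll(filepath.Dir(fn), 0755); err != nil {
        return nil, err
    }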

View File

@@ -351,18 +351,6 @@ func NewServer(bind string, options ...Option) (*Server, error) {
         return nil, fmt.Errorf("error validating config: %w", err)
     }

-    indexer, err := NewIndexer(config)
-    if err != nil {
-        log.WithError(err).Error("error creating indexer")
-        return nil, err
-    }
-
-    crawler, err := NewCrawler(config, indexer)
-    if err != nil {
-        log.WithError(err).Error("error creating crawler")
-        return nil, err
-    }
-
     db, err := NewStore(config.Store)
     if err != nil {
         log.WithError(err).Error("error creating store")
@@ -380,6 +368,18 @@ func NewServer(bind string, options ...Option) (*Server, error) {
         return nil, err
     }

+    indexer, err := NewIndexer(config)
+    if err != nil {
+        log.WithError(err).Error("error creating indexer")
+        return nil, err
+    }
+
+    crawler, err := NewCrawler(config, db, indexer)
+    if err != nil {
+        log.WithError(err).Error("error creating crawler")
+        return nil, err
+    }
+
     router := NewRouter()

     am := auth.NewManager(auth.NewOptions("/login", "/register"))

View File

@@ -11,6 +11,7 @@ var (
     ErrInvalidStore   = errors.New("error: invalid store")
     ErrUserNotFound   = errors.New("error: user not found")
     ErrTokenNotFound  = errors.New("error: token not found")
+    ErrURLNotFound    = errors.New("error: url not found")
     ErrInvalidSession = errors.New("error: invalid session")
 )
@@ -27,6 +28,13 @@ type Store interface {
     SearchUsers(prefix string) []string
     GetAllUsers() ([]*User, error)

+    DelURL(hash string) error
+    HasURL(hash string) bool
+    GetURL(hash string) (*URL, error)
+    SetURL(hash string, url *URL) error
+    URLCount() int64
+    ForEachURL(f func(*URL) error) error
+
     GetSession(sid string) (*session.Session, error)
     SetSession(sid string, sess *session.Session) error
     HasSession(sid string) bool
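Every Store implementation now has to carry the six URL methods. A minimal in-memory sketch of just those methods, hypothetical and for illustration only (a real implementation must also provide the rest of the interface):

    type memStore struct {
        urls map[string]*URL
    }

    func (m *memStore) HasURL(hash string) bool { _, ok := m.urls[hash]; return ok }

    func (m *memStore) DelURL(hash string) error { delete(m.urls, hash); return nil }

    func (m *memStore) SetURL(hash string, url *URL) error { m.urls[hash] = url; return nil }

    func (m *memStore) GetURL(hash string) (*URL, error) {
        if u, ok := m.urls[hash]; ok {
            return u, nil
        }
        return nil, ErrURLNotFound
    }

    func (m *memStore) URLCount() int64 { return int64(len(m.urls)) }

    func (m *memStore) ForEachURL(f func(*URL) error) error {
        for _, u := range m.urls {
            if err := f(u); err != nil {
                return err
            }
        }
        return nil
    }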

View File

@@ -42,6 +42,8 @@ const (
     WeekAgo  = DayAgo * 7
     MonthAgo = DayAgo * 30
     YearAgo  = MonthAgo * 12
+
+    URLHashLength = 15
 )

 var (
@@ -73,6 +75,11 @@ func FastHash(s string) string {
     return hash
 }

+func HashURL(url string) string {
+    hash := FastHash(url)
+    return hash[len(hash)-URLHashLength:]
+}
+
 func IntPow(x, y int) int {
     return int(math.Pow(float64(x), float64(y)))
 }
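HashURL is just FastHash truncated to its last URLHashLength (15) characters, which keeps the /urls keys short while staying deterministic per URL. Usage, as in the store and crawler above:

    hash := HashURL("https://example.com/")          // 15-character suffix of FastHash
    key := fmt.Sprintf("%s/%s", urlsKeyPrefix, hash) // the Bitcask key, "/urls/<hash>"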