Refactored crawler to use task dispatcher

James Mills
2021-02-02 13:13:12 +10:00
parent 4970b16d61
commit 26df009e8f
13 changed files with 578 additions and 51 deletions
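Only the crawler side of the change is shown below; the Dispatcher and CrawlTask types this commit introduces live in the other changed files. A minimal sketch of the shape the crawler appears to rely on, inferred purely from its usage in the diff (the Task interface, the field names, and the ID scheme here are assumptions, not this commit's actual implementation):

package internal

import (
	"fmt"
	"sync/atomic"
)

// Task is assumed to be the unit of work a worker runs.
type Task interface {
	Run() error
}

// Dispatcher is sketched from its usage in the crawler: Dispatch
// accepts a task, queues it for a worker pool, and returns an
// identifier the caller can report or look up later.
type Dispatcher struct {
	tasks  chan Task // drained by worker goroutines (elided here)
	nextID uint64
}

// Dispatch enqueues a task and returns its ID, or an error if the
// queue is full.
func (d *Dispatcher) Dispatch(task Task) (string, error) {
	select {
	case d.tasks <- task:
		id := atomic.AddUint64(&d.nextID, 1)
		return fmt.Sprintf("task-%d", id), nil
	default:
		return "", fmt.Errorf("dispatcher queue is full")
	}
}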

@@ -1,71 +1,48 @@
 package internal
 
 import (
-	"time"
 	log "github.com/sirupsen/logrus"
 )
 
 type Crawler interface {
 	Start()
 	Stop()
 	Crawl(url string) error
 }
 
 type crawler struct {
 	conf    *Config
+	tasks   *Dispatcher
 	db      Store
-	queue   chan string
 	indexer Indexer
+	queue   chan string
 }
 
-func NewCrawler(conf *Config, db Store, indexer Indexer) (Crawler, error) {
+func NewCrawler(conf *Config, tasks *Dispatcher, db Store, indexer Indexer) (Crawler, error) {
 	return &crawler{
 		conf:    conf,
+		tasks:   tasks,
 		db:      db,
-		queue:   make(chan string),
 		indexer: indexer,
+		queue:   make(chan string),
 	}, nil
 }
 
 func (c *crawler) loop() {
 	for {
-		url := <-c.queue
-		log.Debugf("crawling %s", url)
-		links, err := GetLinks(url)
-		if err != nil {
-			log.WithError(err).Error("error crawling %s", url)
-			continue
-		}
-		for link := range links {
-			hash := HashURL(link)
-			if c.db.HasURL(hash) {
-				log.Debugf("seen %s (skipping)", link)
-				continue
-			}
-			log.Debugf("found %s", link)
-			metrics.Counter("crawler", "crawled").Inc()
-			url := NewURL(link)
-			url.CrawledAt = time.Now()
-			entry, err := Scrape(c.conf, link)
-			if err != nil {
-				log.WithError(err).Error("error scraping %s", link)
-			} else {
-				if err := c.indexer.Index(entry); err != nil {
-					log.WithError(err).Error("error indexing %s", link)
-				} else {
-					if err := c.db.SetURL(hash, url); err != nil {
-						log.WithError(err).Error("error recording url %s", link)
-					} else {
-						metrics.Counter("crawler", "scraped").Inc()
-					}
-				}
-			}
-		}
+		select {
+		case url, ok := <-c.queue:
+			if !ok {
+				log.Debugf("crawler shutting down...")
+				return
+			}
+			log.Debugf("crawling %s", url)
+			uuid, err := c.tasks.Dispatch(NewCrawlTask(c.conf, c.db, c.indexer, url))
+			if err != nil {
+				log.WithError(err).Error("error creating crawl task for %s", url)
+			} else {
+				taskURL := URLForTask(c.conf.BaseURL, uuid)
+				log.WithField("uuid", uuid).Infof("successfully created crawl task for %s: %s", url, taskURL)
+			}
+		}
 	}
 }
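The scraping and indexing logic removed from the loop above presumably moves into the task constructed by NewCrawlTask, which is defined in one of the other changed files and not shown here. A rough sketch of such a task, reusing the code removed from the old loop (the crawlTask type, its Run signature matching the Task interface sketched above, and the error handling are assumptions):

package internal

import (
	"time"

	log "github.com/sirupsen/logrus"
)

// crawlTask packages the work the old loop body did inline so the
// dispatcher can run it on a worker.
type crawlTask struct {
	conf    *Config
	db      Store
	indexer Indexer
	url     string
}

func NewCrawlTask(conf *Config, db Store, indexer Indexer, url string) Task {
	return &crawlTask{conf: conf, db: db, indexer: indexer, url: url}
}

// Run mirrors the removed loop body: fetch links, skip already-seen
// URLs, scrape, index, and record each new URL.
func (t *crawlTask) Run() error {
	links, err := GetLinks(t.url)
	if err != nil {
		return err
	}
	for link := range links {
		hash := HashURL(link)
		if t.db.HasURL(hash) {
			continue // already seen
		}
		u := NewURL(link)
		u.CrawledAt = time.Now()

		entry, err := Scrape(t.conf, link)
		if err != nil {
			log.WithError(err).Errorf("error scraping %s", link)
			continue
		}
		if err := t.indexer.Index(entry); err != nil {
			log.WithError(err).Errorf("error indexing %s", link)
			continue
		}
		if err := t.db.SetURL(hash, u); err != nil {
			log.WithError(err).Errorf("error recording url %s", link)
		}
	}
	return nil
}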
@@ -76,6 +53,10 @@ func (c *crawler) Crawl(url string) error {
 	return nil
 }
 
+func (c *crawler) Stop() {
+	close(c.queue)
+}
+
 func (c *crawler) Start() {
 	go c.loop()
 }
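With the second hunk, shutting down is just closing the queue: loop() sees the closed channel (ok == false), logs, and returns. A hypothetical caller-side wiring, assuming conf, tasks, db and indexer have already been constructed elsewhere in this commit, and that Crawl pushes the URL onto the queue:

crawler, err := NewCrawler(conf, tasks, db, indexer)
if err != nil {
	log.WithError(err).Fatal("error creating crawler")
}

crawler.Start() // runs loop() in a goroutine
if err := crawler.Crawl("https://example.com"); err != nil {
	log.WithError(err).Error("error queueing crawl")
}
crawler.Stop() // closes the queue; loop() returns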