Refactored crawler to use task dispatcher
@@ -1,71 +1,48 @@
 package internal
 
 import (
 	"time"
 
 	log "github.com/sirupsen/logrus"
 )
 
 type Crawler interface {
 	Start()
 	Stop()
 	Crawl(url string) error
 }
 
 type crawler struct {
 	conf *Config
+	tasks *Dispatcher
 	db Store
-	queue chan string
 	indexer Indexer
+	queue chan string
 }
 
-func NewCrawler(conf *Config, db Store, indexer Indexer) (Crawler, error) {
+func NewCrawler(conf *Config, tasks *Dispatcher, db Store, indexer Indexer) (Crawler, error) {
 	return &crawler{
 		conf: conf,
+		tasks: tasks,
 		db: db,
-		queue: make(chan string),
 		indexer: indexer,
+		queue: make(chan string),
 	}, nil
 }
 
 func (c *crawler) loop() {
 	for {
-		url := <-c.queue
-		log.Debugf("crawling %s", url)
-
-		links, err := GetLinks(url)
-		if err != nil {
-			log.WithError(err).Error("error crawling %s", url)
-			continue
-		}
-
-		for link := range links {
-			hash := HashURL(link)
-
-			if c.db.HasURL(hash) {
-				log.Debugf("seen %s (skipping)", link)
-				continue
+		select {
+		case url, ok := <-c.queue:
+			if !ok {
+				log.Debugf("crawler shutting down...")
+				return
 			}
 
-			log.Debugf("found %s", link)
-
-			metrics.Counter("crawler", "crawled").Inc()
-
-			url := NewURL(link)
-			url.CrawledAt = time.Now()
-
-			entry, err := Scrape(c.conf, link)
+			log.Debugf("crawling %s", url)
+			uuid, err := c.tasks.Dispatch(NewCrawlTask(c.conf, c.db, c.indexer, url))
 			if err != nil {
-				log.WithError(err).Error("error scraping %s", link)
+				log.WithError(err).Errorf("error creating crawl task for %s", url)
 			} else {
-				if err := c.indexer.Index(entry); err != nil {
-					log.WithError(err).Error("error indexing %s", link)
-				} else {
-					if err := c.db.SetURL(hash, url); err != nil {
-						log.WithError(err).Error("error recording url %s", link)
-					} else {
-						metrics.Counter("crawler", "scraped").Inc()
-					}
-				}
+				taskURL := URLForTask(c.conf.BaseURL, uuid)
+				log.WithField("uuid", uuid).Infof("successfully created crawl task for %s: %s", url, taskURL)
 			}
 		}
 	}
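
The diff shows only the call side of the dispatcher: Dispatch takes the task built by NewCrawlTask and returns a (uuid, error) pair, and the scrape/index/record logic deleted from the loop presumably moves into that task. Below is a minimal sketch of a dispatcher with that shape, assuming a plain worker-pool design; Task, NewDispatcher, crawlTask, and the worker count are illustrative names, not definitions from this commit.

package main

import (
	"crypto/rand"
	"encoding/hex"
	"fmt"
	"sync"
)

// Task is the unit of work a dispatcher runs; NewCrawlTask in the diff
// presumably builds a value satisfying an interface of roughly this shape.
type Task interface {
	Run() error
}

// Dispatcher fans tasks out to a fixed pool of worker goroutines.
type Dispatcher struct {
	tasks chan Task
	wg    sync.WaitGroup
}

// NewDispatcher starts the given number of workers draining the task channel.
func NewDispatcher(workers int) *Dispatcher {
	d := &Dispatcher{tasks: make(chan Task)}
	d.wg.Add(workers)
	for i := 0; i < workers; i++ {
		go func() {
			defer d.wg.Done()
			for t := range d.tasks {
				if err := t.Run(); err != nil {
					fmt.Println("task failed:", err)
				}
			}
		}()
	}
	return d
}

// Dispatch queues a task and returns a generated ID, mirroring the
// (uuid, err) pair the crawler's loop expects. The send blocks until a
// worker is free, because the channel is unbuffered.
func (d *Dispatcher) Dispatch(t Task) (string, error) {
	id := make([]byte, 16)
	if _, err := rand.Read(id); err != nil {
		return "", err
	}
	d.tasks <- t
	return hex.EncodeToString(id), nil
}

// Stop closes the task channel and waits for in-flight tasks to finish.
func (d *Dispatcher) Stop() {
	close(d.tasks)
	d.wg.Wait()
}

// crawlTask is an illustrative stand-in for whatever NewCrawlTask builds:
// the scrape -> index -> record steps deleted from the loop above.
type crawlTask struct{ url string }

func (t *crawlTask) Run() error {
	fmt.Println("scraping, indexing and recording", t.url)
	return nil
}

func main() {
	d := NewDispatcher(4)
	uuid, err := d.Dispatch(&crawlTask{url: "https://example.com/"})
	fmt.Println("dispatched", uuid, err)
	d.Stop()
}

An unbuffered task channel keeps backpressure on the producer; a buffered channel would let Dispatch return before a worker is free.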
@@ -76,6 +53,10 @@ func (c *crawler) Crawl(url string) error {
 	return nil
 }
 
+func (c *crawler) Stop() {
+	close(c.queue)
+}
+
 func (c *crawler) Start() {
 	go c.loop()
 }
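
With Stop in place, shutdown flows through the channel close: Stop closes c.queue, the comma-ok receive in the loop's select sees ok == false, and the goroutine spawned by Start returns. A hypothetical wiring of that lifecycle, assuming conf, tasks, db, and indexer are built elsewhere in the package (none of that setup appears in this diff):

// Hypothetical wiring; the constructors for conf, tasks, db, and indexer
// are not part of this commit.
crawler, err := NewCrawler(conf, tasks, db, indexer)
if err != nil {
	log.Fatal(err)
}

crawler.Start()                           // spawns the loop goroutine
_ = crawler.Crawl("https://example.com/") // presumably enqueues the URL onto c.queue
crawler.Stop()                            // closes c.queue; the loop logs "crawler shutting down..." and returns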