spyda/internal/crawl_task.go

package internal

import (
    "fmt"
    "time"

    log "github.com/sirupsen/logrus"
)
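
// CrawlTask is a Task that crawls a single URL: it fetches the page,
// then scrapes and indexes every link on it that has not been seen
// before, recording each processed URL in the store.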
type CrawlTask struct {
    *BaseTask

    conf    *Config
    db      Store
    indexer Indexer
    url     string
}
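
// NewCrawlTask constructs a CrawlTask for the given URL using the
// supplied configuration, store and indexer.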
func NewCrawlTask(conf *Config, db Store, indexer Indexer, url string) *CrawlTask {
    return &CrawlTask{
        BaseTask: NewBaseTask(),
        conf:     conf,
        db:       db,
        indexer:  indexer,
        url:      url,
    }
}
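
// String implements fmt.Stringer, identifying the task by its type and ID.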
func (t *CrawlTask) String() string { return fmt.Sprintf("%T: %s", t, t.ID()) }
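
// Run executes the crawl. A failure to fetch the page's links fails the
// whole task; per-link scrape and index errors are logged and skipped so
// one bad link cannot abort the rest of the crawl.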
func (t *CrawlTask) Run() error {
    defer t.Done()
    t.SetState(TaskStateRunning)

    log.Infof("starting crawl task for %s", t.url)

    log.Debugf("crawling %s", t.url)
    links, err := GetLinks(t.url)
    if err != nil {
        // Errorf (not Error) so the %s verb is actually interpolated.
        log.WithError(err).Errorf("error crawling %s", t.url)
        return t.Fail(fmt.Errorf("error crawling %s: %w", t.url, err))
    }
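
    // Walk every link found on the page, skipping any URL the store has
    // already seen.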
    for _, link := range links {
        hash := HashURL(link)
        if t.db.HasURL(hash) {
            // Skip this link only; returning here would abandon the
            // remaining links despite the "(skipping)" log message.
            log.Debugf("seen %s (skipping)", link)
            continue
        }
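
        // Note: metrics, used below, is assumed to be a package-level
        // registry defined elsewhere in package internal; it is not
        // imported in this file.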
log.Debugf("found %s", link)
metrics.Counter("crawler", "crawled").Inc()
url := NewURL(link)
url.CrawledAt = time.Now()
entry, err := Scrape(t.conf, link)
if err != nil {
log.WithError(err).Warn("error scraping %s", link)
continue
}
        if err := t.indexer.Index(entry); err != nil {
            log.WithError(err).Warnf("error indexing %s", link)
            continue
        }

        if err := t.db.SetURL(hash, url); err != nil {
            log.WithError(err).Warnf("error recording url %s", link)
        }

        metrics.Counter("crawler", "scraped").Inc()
    }

    return nil
}