package internal

import (
	"fmt"
	"time"

	log "github.com/sirupsen/logrus"
)

// CrawlTask crawls a single URL, scraping and indexing any new links it finds.
type CrawlTask struct {
	*BaseTask

	conf    *Config
	db      Store
	indexer Indexer
	url     string
}

// NewCrawlTask returns a CrawlTask for the given URL using the provided
// configuration, store and indexer.
func NewCrawlTask(conf *Config, db Store, indexer Indexer, url string) *CrawlTask {
	return &CrawlTask{
		BaseTask: NewBaseTask(),

		conf:    conf,
		db:      db,
		indexer: indexer,
		url:     url,
	}
}

func (t *CrawlTask) String() string {
	return fmt.Sprintf("%T: %s", t, t.ID())
}

// Run crawls the task's URL and, for each link not seen before, scrapes it,
// indexes the resulting entry and records the URL in the store.
func (t *CrawlTask) Run() error {
	defer t.Done()
	t.SetState(TaskStateRunning)

	log.Infof("starting crawl task for %s", t.url)

	log.Debugf("crawling %s", t.url)
	links, err := GetLinks(t.url)
	if err != nil {
		log.WithError(err).Errorf("error crawling %s", t.url)
		return t.Fail(fmt.Errorf("error crawling %s: %w", t.url, err))
	}

	for link := range links {
		hash := HashURL(link)
		if t.db.HasURL(hash) {
			// Already crawled; skip this link rather than aborting the task.
			log.Debugf("seen %s (skipping)", link)
			continue
		}
		log.Debugf("found %s", link)
		metrics.Counter("crawler", "crawled").Inc()

		url := NewURL(link)
		url.CrawledAt = time.Now()

		entry, err := Scrape(t.conf, link)
		if err != nil {
			log.WithError(err).Warnf("error scraping %s", link)
			continue
		}

		if err := t.indexer.Index(entry); err != nil {
			log.WithError(err).Warnf("error indexing %s", link)
			continue
		}

		if err := t.db.SetURL(hash, url); err != nil {
			log.WithError(err).Warnf("error recording url %s", link)
		}

		metrics.Counter("crawler", "scraped").Inc()
	}

	return nil
}