package internal

import (
	"fmt"
	"time"

	log "github.com/sirupsen/logrus"
)

// CrawlTask crawls a single URL, scraping and indexing every new link it finds.
type CrawlTask struct {
	*BaseTask

	conf    *Config
	db      Store
	indexer Indexer
	url     string
}

// NewCrawlTask constructs a CrawlTask for url using the given configuration, store and indexer.
func NewCrawlTask(conf *Config, db Store, indexer Indexer, url string) *CrawlTask {
	return &CrawlTask{
		BaseTask: NewBaseTask(),

		conf:    conf,
		db:      db,
		indexer: indexer,
		url:     url,
	}
}

// String implements fmt.Stringer.
func (t *CrawlTask) String() string { return fmt.Sprintf("%T: %s", t, t.ID()) }

// Run fetches the links found at the task's URL, scrapes and indexes each
// previously unseen link, and records counters and timing data on the task.
func (t *CrawlTask) Run() error {
	var (
		nLinks   int
		nCrawled int
		nScraped int
		sTime    time.Time
		eTime    time.Time
	)

	sTime = time.Now()

	t.SetState(TaskStateRunning)
	defer func() {
		// Record counters, timing and duration metrics regardless of how Run exits.
		eTime = time.Now()
		t.SetData("links", fmt.Sprintf("%d", nLinks))
		t.SetData("crawled", fmt.Sprintf("%d", nCrawled))
		t.SetData("scraped", fmt.Sprintf("%d", nScraped))
		t.SetData("start_time", sTime.String())
		t.SetData("end_time", eTime.String())
		t.SetData("duration", fmt.Sprintf("%0.2f", eTime.Sub(sTime).Seconds()))
		metrics.Gauge("crawler", "duration").Set(eTime.Sub(sTime).Seconds())
		t.Done()
	}()

	t.SetData("url", t.url)

	log.Infof("starting crawl task for %s", t.url)

	log.Debugf("crawling %s", t.url)
	links, err := GetLinks(t.url)
	if err != nil {
		log.WithError(err).Errorf("error crawling %s", t.url)
		return t.Fail(fmt.Errorf("error crawling %s: %w", t.url, err))
	}

	for link := range links {
		nLinks++

		hash := HashURL(link)

		// Skip links we have already seen rather than aborting the whole task.
		if t.db.HasURL(hash) {
			log.Debugf("seen %s (skipping)", link)
			continue
		}
		log.Debugf("found %s", link)
		nCrawled++
		metrics.Counter("crawler", "crawled").Inc()

		url := NewURL(link)
		url.CrawledAt = time.Now()

		entry, err := Scrape(t.conf, link)
		if err != nil {
			log.WithError(err).Warnf("error scraping %s", link)
			continue
		}

		if err := t.indexer.Index(entry); err != nil {
			log.WithError(err).Warnf("error indexing %s", link)
			continue
		}

		if err := t.db.SetURL(hash, url); err != nil {
			log.WithError(err).Warnf("error recording url %s", link)
		}
		nScraped++
		metrics.Counter("crawler", "scraped").Inc()
	}

	return nil
}
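// Example usage (a minimal sketch; how the Config, Store and Indexer values are
// constructed, and the task runner that ordinarily schedules a CrawlTask, live
// elsewhere in this package and are assumed here):
//
//	task := NewCrawlTask(conf, db, indexer, "https://example.com/")
//	if err := task.Run(); err != nil {
//		log.WithError(err).Errorf("crawl task %s failed", task.ID())
//	}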