spyda/internal/crawl_task.go

package internal

import (
	"fmt"
	"time"

	log "github.com/sirupsen/logrus"
)
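
// CrawlTask crawls a single URL: it extracts the links on that page,
// scrapes and indexes each link not already seen, and records metrics.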
type CrawlTask struct {
	*BaseTask

	conf    *Config
	db      Store
	indexer Indexer
	url     string
}
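
// NewCrawlTask returns a CrawlTask for the given URL, wired up with the
// crawler configuration, URL store, and indexer it needs to run.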
func NewCrawlTask(conf *Config, db Store, indexer Indexer, url string) *CrawlTask {
	return &CrawlTask{
		BaseTask: NewBaseTask(),
		conf:     conf,
		db:       db,
		indexer:  indexer,
		url:      url,
	}
}
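
// String implements fmt.Stringer, labelling the task by its type and ID.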
func (t *CrawlTask) String() string { return fmt.Sprintf("%T: %s", t, t.ID()) }
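
// Run fetches the links on the task URL, scrapes and indexes each page that
// has not been seen before, and records counts and timing when it finishes.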
func (t *CrawlTask) Run() error {
	var (
		nLinks   int
		nCrawled int
		nScraped int
		sTime    time.Time
		eTime    time.Time
	)

	sTime = time.Now()
	t.SetState(TaskStateRunning)
	// Record counters, timing and metrics once the task finishes,
	// regardless of how it exits.
	defer func() {
		eTime = time.Now()

		t.SetData("links", fmt.Sprintf("%d", nLinks))
		t.SetData("crawled", fmt.Sprintf("%d", nCrawled))
		t.SetData("scraped", fmt.Sprintf("%d", nScraped))
		t.SetData("start_time", sTime.String())
		t.SetData("end_time", eTime.String())
		t.SetData("duration", fmt.Sprintf("%0.2f", eTime.Sub(sTime).Seconds()))

		metrics.Gauge("crawler", "duration").Set(eTime.Sub(sTime).Seconds())

		t.Done()
	}()
	t.SetData("url", t.url)

	log.Infof("starting crawl task for %s", t.url)

	log.Debugf("crawling %s", t.url)
	links, err := GetLinks(t.url)
	if err != nil {
		log.WithError(err).Errorf("error crawling %s", t.url)
		return t.Fail(fmt.Errorf("error crawling %s: %w", t.url, err))
	}
	for link := range links {
		nLinks++

		hash := HashURL(link)
		if t.db.HasURL(hash) {
			// Skip links that have already been crawled, but keep
			// processing the remaining links on the page.
			log.Debugf("seen %s (skipping)", link)
			continue
		}
		log.Debugf("found %s", link)
		nCrawled++
		metrics.Counter("crawler", "crawled").Inc()

		url := NewURL(link)
		url.CrawledAt = time.Now()

		entry, err := Scrape(t.conf, link)
		if err != nil {
			log.WithError(err).Warnf("error scraping %s", link)
			continue
		}

		if err := t.indexer.Index(entry); err != nil {
			log.WithError(err).Warnf("error indexing %s", link)
			continue
		}

		if err := t.db.SetURL(hash, url); err != nil {
			log.WithError(err).Warnf("error recording url %s", link)
		}

		nScraped++
		metrics.Counter("crawler", "scraped").Inc()
	}

	return nil
}