package internal

import (
	"time"

	log "github.com/sirupsen/logrus"
)

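// Crawler is a web crawler that can be started in the background and fed
// URLs to crawl.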
type Crawler interface {
	Start()
	Crawl(url string) error
}

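// crawler implements Crawler, de-duplicating links via a Store, indexing
// scraped pages via an Indexer and feeding work through a queue of URLs.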
type crawler struct {
	conf    *Config
	db      Store
	queue   chan string
	indexer Indexer
}

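// NewCrawler constructs a Crawler from the given configuration, store and
// indexer. A minimal usage sketch (assuming conf, db and indexer are built
// elsewhere in this package; the URL is only an example):
//
//	c, err := NewCrawler(conf, db, indexer)
//	if err != nil {
//		log.Fatal(err)
//	}
//	c.Start()
//	c.Crawl("https://example.com/")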
func NewCrawler(conf *Config, db Store, indexer Indexer) (Crawler, error) {
	return &crawler{
		conf:    conf,
		db:      db,
		queue:   make(chan string),
		indexer: indexer,
	}, nil
}

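// loop consumes URLs from the queue; for each URL it extracts the page's
// links and, for every link not already recorded in the store, scrapes the
// page, indexes the resulting entry and records the link as seen.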
func (c *crawler) loop() {
	for {
		url := <-c.queue
		log.Debugf("crawling %s", url)

		links, err := GetLinks(url)
		if err != nil {
			log.WithError(err).Errorf("error crawling %s", url)
			continue
		}

		for link := range links {
			hash := HashURL(link)

			// Skip links we have already seen.
			if c.db.HasURL(hash) {
				log.Debugf("seen %s (skipping)", link)
				continue
			}

			log.Debugf("found %s", link)

			metrics.Counter("crawler", "crawled").Inc()

			url := NewURL(link)
			url.CrawledAt = time.Now()

			// Only record the link as crawled once it has been
			// scraped and indexed successfully.
			entry, err := Scrape(c.conf, link)
			if err != nil {
				log.WithError(err).Errorf("error scraping %s", link)
			} else {
				if err := c.indexer.Index(entry); err != nil {
					log.WithError(err).Errorf("error indexing %s", link)
				} else {
					if err := c.db.SetURL(hash, url); err != nil {
						log.WithError(err).Errorf("error recording url %s", link)
					} else {
						metrics.Counter("crawler", "scraped").Inc()
					}
				}
			}
		}
	}
}

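// Crawl enqueues a URL for the crawl loop, blocking until the loop picks it up.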
func (c *crawler) Crawl(url string) error {
	c.queue <- url
	return nil
}

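// Start runs the crawl loop in a background goroutine.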
func (c *crawler) Start() {
	go c.loop()
}