package internal

import (
	"time"

	log "github.com/sirupsen/logrus"
)

// Crawler consumes URLs from a queue, scrapes them and indexes the results.
type Crawler interface {
	Start()
	Crawl(url string) error
}

type crawler struct {
	conf    *Config
	db      Store
	queue   chan string
	indexer Indexer
}

// NewCrawler returns a Crawler backed by the given config, store and indexer.
func NewCrawler(conf *Config, db Store, indexer Indexer) (Crawler, error) {
	return &crawler{
		conf:    conf,
		db:      db,
		queue:   make(chan string),
		indexer: indexer,
	}, nil
}

// loop reads URLs off the queue, extracts their links, and scrapes, indexes
// and records every link that has not been seen before.
func (c *crawler) loop() {
	for {
		url := <-c.queue
		log.Debugf("crawling %s", url)

		links, err := GetLinks(url)
		if err != nil {
			log.WithError(err).Errorf("error crawling %s", url)
			continue
		}

		for link := range links {
			// Skip links we have already crawled.
			hash := HashURL(link)
			if c.db.HasURL(hash) {
				log.Debugf("seen %s (skipping)", link)
				continue
			}

			log.Debugf("found %s", link)
			// metrics is assumed to be a package-level metrics registry
			// defined elsewhere in this package.
			metrics.Counter("crawler", "crawled").Inc()

			u := NewURL(link)
			u.CrawledAt = time.Now()

			entry, err := Scrape(c.conf, link)
			if err != nil {
				log.WithError(err).Errorf("error scraping %s", link)
				continue
			}

			if err := c.indexer.Index(entry); err != nil {
				log.WithError(err).Errorf("error indexing %s", link)
				continue
			}

			if err := c.db.SetURL(hash, u); err != nil {
				log.WithError(err).Errorf("error recording url %s", link)
				continue
			}

			metrics.Counter("crawler", "scraped").Inc()
		}
	}
}

// Crawl queues a URL for crawling. It blocks until the crawl loop accepts
// the URL, since the queue is unbuffered.
func (c *crawler) Crawl(url string) error {
	c.queue <- url
	return nil
}

// Start runs the crawl loop in a background goroutine.
func (c *crawler) Start() {
	go c.loop()
}
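
// The block below is a minimal usage sketch, not part of the original file.
// It assumes the caller has already constructed a *Config, a Store and an
// Indexer (their constructors are not defined here) and shows how the
// Crawler interface above is intended to be driven:
//
//	crawler, err := NewCrawler(conf, db, indexer)
//	if err != nil {
//		log.WithError(err).Fatal("error creating crawler")
//	}
//	crawler.Start()
//	if err := crawler.Crawl("https://example.com/"); err != nil {
//		log.WithError(err).Error("error queueing url")
//	}
//
// Start returns immediately; Crawl blocks until the background loop picks
// the URL up because the queue channel is unbuffered.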