package internal

import (
	"time"

	log "github.com/sirupsen/logrus"
)

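// Crawler is a web crawler that can be started in the background and fed
// URLs to crawl.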
type Crawler interface {
	Start()
	Crawl(url string) error
}

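// crawler implements Crawler, de-duplicating links via a Store, indexing
// scraped pages via an Indexer and feeding work through a queue of URLs.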
type crawler struct {
	conf    *Config
	db      Store
	queue   chan string
	indexer Indexer
}

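// NewCrawler constructs a Crawler from the given configuration, store and
// indexer. A minimal usage sketch (assuming conf, db and indexer are built
// elsewhere in this package; the URL is only an example):
//
//	c, err := NewCrawler(conf, db, indexer)
//	if err != nil {
//		log.Fatal(err)
//	}
//	c.Start()
//	c.Crawl("https://example.com/")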
func NewCrawler(conf *Config, db Store, indexer Indexer) (Crawler, error) {
	return &crawler{
		conf:    conf,
		db:      db,
		queue:   make(chan string),
		indexer: indexer,
	}, nil
}

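// loop consumes URLs from the queue; for each URL it extracts the page's
// links and, for every link not already recorded in the store, scrapes the
// page, indexes the resulting entry and records the link as seen.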
func (c *crawler) loop() {
	for {
		url := <-c.queue
		log.Debugf("crawling %s", url)

		links, err := GetLinks(url)
		if err != nil {
			log.WithError(err).Errorf("error crawling %s", url)
			continue
		}

		for link := range links {
			hash := HashURL(link)

			// Skip links we have already seen.
			if c.db.HasURL(hash) {
				log.Debugf("seen %s (skipping)", link)
				continue
			}

			log.Debugf("found %s", link)

			metrics.Counter("crawler", "crawled").Inc()

			url := NewURL(link)
			url.CrawledAt = time.Now()

			// Only record the link as crawled once it has been
			// scraped and indexed successfully.
			entry, err := Scrape(c.conf, link)
			if err != nil {
				log.WithError(err).Errorf("error scraping %s", link)
			} else {
				if err := c.indexer.Index(entry); err != nil {
					log.WithError(err).Errorf("error indexing %s", link)
				} else {
					if err := c.db.SetURL(hash, url); err != nil {
						log.WithError(err).Errorf("error recording url %s", link)
					} else {
						metrics.Counter("crawler", "scraped").Inc()
					}
				}
			}
		}
	}
}

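// Crawl enqueues a URL for the crawl loop, blocking until the loop picks it up.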
func (c *crawler) Crawl(url string) error {
	c.queue <- url
	return nil
}

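// Start runs the crawl loop in a background goroutine.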
func (c *crawler) Start() {
	go c.loop()
}