package internal

import (
	"fmt"
	"time"

	log "github.com/sirupsen/logrus"
)
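// CrawlTask crawls a single URL: it collects the links found on the page
// and scrapes and indexes each link that has not been seen before.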
type CrawlTask struct {
	*BaseTask

	conf    *Config
	db      Store
	indexer Indexer

	url string
}
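// NewCrawlTask returns a CrawlTask that will crawl the given url using the
// provided configuration, URL store, and indexer.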
func NewCrawlTask(conf *Config, db Store, indexer Indexer, url string) *CrawlTask {
	return &CrawlTask{
		BaseTask: NewBaseTask(),

		conf:    conf,
		db:      db,
		indexer: indexer,

		url: url,
	}
}

func (t *CrawlTask) String() string { return fmt.Sprintf("%T: %s", t, t.ID()) }
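// Run crawls the task's URL, scraping and indexing every previously unseen
// link, and records counters and timings in the task data before marking
// the task done.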
func (t *CrawlTask) Run() error {
	var (
		nLinks   int
		nCrawled int
		nScraped int
		sTime    time.Time
		eTime    time.Time
	)

	sTime = time.Now()
	t.SetState(TaskStateRunning)

	defer func() {
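		// Record counters and timings and mark the task done, whether the
		// crawl succeeded or failed.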
		eTime = time.Now()

		t.SetData("links", fmt.Sprintf("%d", nLinks))
		t.SetData("crawled", fmt.Sprintf("%d", nCrawled))
		t.SetData("scraped", fmt.Sprintf("%d", nScraped))
		t.SetData("start_time", sTime.String())
		t.SetData("end_time", eTime.String())
		t.SetData("duration", fmt.Sprintf("%0.2f", eTime.Sub(sTime).Seconds()))

		t.Done()
	}()

	t.SetData("url", t.url)
	log.Infof("starting crawl task for %s", t.url)

	log.Debugf("crawling %s", t.url)

	links, err := GetLinks(t.url)
	if err != nil {
		log.WithError(err).Errorf("error crawling %s", t.url)
		return t.Fail(fmt.Errorf("error crawling %s: %w", t.url, err))
	}

	for link := range links {
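		// Count every link; URLs already present in the store are skipped.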
		nLinks++
		hash := HashURL(link)

		if t.db.HasURL(hash) {
			log.Debugf("seen %s (skipping)", link)
			continue
		}

		log.Debugf("found %s", link)

		nCrawled++
		metrics.Counter("crawler", "crawled").Inc()
		// A URL is only recorded in the store once it has been scraped
		// and indexed successfully.
		url := NewURL(link)
		url.CrawledAt = time.Now()

		entry, err := Scrape(t.conf, link)
		if err != nil {
			log.WithError(err).Warnf("error scraping %s", link)
			continue
		}

		if err := t.indexer.Index(entry); err != nil {
			log.WithError(err).Warnf("error indexing %s", link)
			continue
		}
		if err := t.db.SetURL(hash, url); err != nil {
			log.WithError(err).Warnf("error recording url %s", link)
		}

		nScraped++
		metrics.Counter("crawler", "scraped").Inc()
	}

	return nil
}