Add a /tasks endpoint and extra metadata to crawl tasks

This commit is contained in:
James Mills
2021-02-02 14:58:52 +10:00
parent 5cd5495622
commit 3923ff91ba
4 changed files with 52 additions and 1 deletions

View File

@@ -31,9 +31,31 @@ func NewCrawlTask(conf *Config, db Store, indexer Indexer, url string) *CrawlTas
// String renders the task as its dynamic Go type followed by the value
// returned from t.ID(), separated by ": ".
func (t *CrawlTask) String() string {
	id := t.ID()
	return fmt.Sprintf("%T: %s", t, id)
}
func (t *CrawlTask) Run() error {
defer t.Done()
var (
nLinks int
nCrawled int
nScraped int
sTime time.Time
eTime time.Time
)
sTime = time.Now()
t.SetState(TaskStateRunning)
defer func() {
eTime = time.Now()
t.SetData("links", fmt.Sprintf("%d", nLinks))
t.SetData("crawled", fmt.Sprintf("%d", nCrawled))
t.SetData("scraped", fmt.Sprintf("%d", nScraped))
t.SetData("start_time", sTime.String())
t.SetData("eid_time", eTime.String())
t.SetData("duration", fmt.Sprintf("%0.2f", eTime.Sub(sTime).Seconds()))
t.Done()
}()
t.SetData("url", t.url)
log.Infof("starting crawl task for %s", t.url)
log.Debugf("crawling %s", t.url)
@@ -45,6 +67,7 @@ func (t *CrawlTask) Run() error {
}
for link := range links {
nLinks++
hash := HashURL(link)
if t.db.HasURL(hash) {
@@ -54,6 +77,7 @@ func (t *CrawlTask) Run() error {
log.Debugf("found %s", link)
nCrawled++
metrics.Counter("crawler", "crawled").Inc()
url := NewURL(link)
@@ -74,6 +98,7 @@ func (t *CrawlTask) Run() error {
log.WithError(err).Warn("error recording url %s", link)
}
nScraped++
metrics.Counter("crawler", "scraped").Inc()
}