Add a /tasks endpoint and extra metadata to crawl tasks
This commit is contained in:
@@ -31,9 +31,31 @@ func NewCrawlTask(conf *Config, db Store, indexer Indexer, url string) *CrawlTas
|
||||
|
||||
// String implements fmt.Stringer. The result is the receiver's Go type
// (as printed by the %T verb), followed by ": " and the value of t.ID().
func (t *CrawlTask) String() string {
	id := t.ID()
	return fmt.Sprintf("%T: %s", t, id)
}
|
||||
func (t *CrawlTask) Run() error {
|
||||
defer t.Done()
|
||||
var (
|
||||
nLinks int
|
||||
nCrawled int
|
||||
nScraped int
|
||||
sTime time.Time
|
||||
eTime time.Time
|
||||
)
|
||||
|
||||
sTime = time.Now()
|
||||
t.SetState(TaskStateRunning)
|
||||
|
||||
defer func() {
|
||||
eTime = time.Now()
|
||||
|
||||
t.SetData("links", fmt.Sprintf("%d", nLinks))
|
||||
t.SetData("crawled", fmt.Sprintf("%d", nCrawled))
|
||||
t.SetData("scraped", fmt.Sprintf("%d", nScraped))
|
||||
t.SetData("start_time", sTime.String())
|
||||
t.SetData("eid_time", eTime.String())
|
||||
t.SetData("duration", fmt.Sprintf("%0.2f", eTime.Sub(sTime).Seconds()))
|
||||
|
||||
t.Done()
|
||||
}()
|
||||
|
||||
t.SetData("url", t.url)
|
||||
log.Infof("starting crawl task for %s", t.url)
|
||||
|
||||
log.Debugf("crawling %s", t.url)
|
||||
@@ -45,6 +67,7 @@ func (t *CrawlTask) Run() error {
|
||||
}
|
||||
|
||||
for link := range links {
|
||||
nLinks++
|
||||
hash := HashURL(link)
|
||||
|
||||
if t.db.HasURL(hash) {
|
||||
@@ -54,6 +77,7 @@ func (t *CrawlTask) Run() error {
|
||||
|
||||
log.Debugf("found %s", link)
|
||||
|
||||
nCrawled++
|
||||
metrics.Counter("crawler", "crawled").Inc()
|
||||
|
||||
url := NewURL(link)
|
||||
@@ -74,6 +98,7 @@ func (t *CrawlTask) Run() error {
|
||||
log.WithError(err).Warn("error recording url %s", link)
|
||||
}
|
||||
|
||||
nScraped++
|
||||
metrics.Counter("crawler", "scraped").Inc()
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user