Add a /tasks endpoint and extra metadata to crawl tasks
This commit is contained in:
@@ -31,9 +31,31 @@ func NewCrawlTask(conf *Config, db Store, indexer Indexer, url string) *CrawlTas
|
|||||||
|
|
||||||
func (t *CrawlTask) String() string { return fmt.Sprintf("%T: %s", t, t.ID()) }
|
func (t *CrawlTask) String() string { return fmt.Sprintf("%T: %s", t, t.ID()) }
|
||||||
func (t *CrawlTask) Run() error {
|
func (t *CrawlTask) Run() error {
|
||||||
defer t.Done()
|
var (
|
||||||
|
nLinks int
|
||||||
|
nCrawled int
|
||||||
|
nScraped int
|
||||||
|
sTime time.Time
|
||||||
|
eTime time.Time
|
||||||
|
)
|
||||||
|
|
||||||
|
sTime = time.Now()
|
||||||
t.SetState(TaskStateRunning)
|
t.SetState(TaskStateRunning)
|
||||||
|
|
||||||
|
defer func() {
|
||||||
|
eTime = time.Now()
|
||||||
|
|
||||||
|
t.SetData("links", fmt.Sprintf("%d", nLinks))
|
||||||
|
t.SetData("crawled", fmt.Sprintf("%d", nCrawled))
|
||||||
|
t.SetData("scraped", fmt.Sprintf("%d", nScraped))
|
||||||
|
t.SetData("start_time", sTime.String())
|
||||||
|
t.SetData("eid_time", eTime.String())
|
||||||
|
t.SetData("duration", fmt.Sprintf("%0.2f", eTime.Sub(sTime).Seconds()))
|
||||||
|
|
||||||
|
t.Done()
|
||||||
|
}()
|
||||||
|
|
||||||
|
t.SetData("url", t.url)
|
||||||
log.Infof("starting crawl task for %s", t.url)
|
log.Infof("starting crawl task for %s", t.url)
|
||||||
|
|
||||||
log.Debugf("crawling %s", t.url)
|
log.Debugf("crawling %s", t.url)
|
||||||
@@ -45,6 +67,7 @@ func (t *CrawlTask) Run() error {
|
|||||||
}
|
}
|
||||||
|
|
||||||
for link := range links {
|
for link := range links {
|
||||||
|
nLinks++
|
||||||
hash := HashURL(link)
|
hash := HashURL(link)
|
||||||
|
|
||||||
if t.db.HasURL(hash) {
|
if t.db.HasURL(hash) {
|
||||||
@@ -54,6 +77,7 @@ func (t *CrawlTask) Run() error {
|
|||||||
|
|
||||||
log.Debugf("found %s", link)
|
log.Debugf("found %s", link)
|
||||||
|
|
||||||
|
nCrawled++
|
||||||
metrics.Counter("crawler", "crawled").Inc()
|
metrics.Counter("crawler", "crawled").Inc()
|
||||||
|
|
||||||
url := NewURL(link)
|
url := NewURL(link)
|
||||||
@@ -74,6 +98,7 @@ func (t *CrawlTask) Run() error {
|
|||||||
log.WithError(err).Warn("error recording url %s", link)
|
log.WithError(err).Warn("error recording url %s", link)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
nScraped++
|
||||||
metrics.Counter("crawler", "scraped").Inc()
|
metrics.Counter("crawler", "scraped").Inc()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -77,6 +77,15 @@ func (d *Dispatcher) Stop() {
|
|||||||
d.quit <- true
|
d.quit <- true
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Tasks returns all tasks
|
||||||
|
func (d *Dispatcher) Tasks() map[string]TaskResult {
|
||||||
|
tasks := make(map[string]TaskResult)
|
||||||
|
for id, task := range d.taskMap {
|
||||||
|
tasks[id] = task.Result()
|
||||||
|
}
|
||||||
|
return tasks
|
||||||
|
}
|
||||||
|
|
||||||
// Lookup returns the matching `Task` given its id
|
// Lookup returns the matching `Task` given its id
|
||||||
func (d *Dispatcher) Lookup(id string) (Task, bool) {
|
func (d *Dispatcher) Lookup(id string) (Task, bool) {
|
||||||
task, ok := d.taskMap[id]
|
task, ok := d.taskMap[id]
|
||||||
|
|||||||
@@ -306,6 +306,7 @@ func (s *Server) initRoutes() {
|
|||||||
s.router.POST("/chpasswd", s.NewPasswordHandler())
|
s.router.POST("/chpasswd", s.NewPasswordHandler())
|
||||||
|
|
||||||
// Task State
|
// Task State
|
||||||
|
s.router.GET("/tasks", s.TasksHandler())
|
||||||
s.router.GET("/task/:uuid", s.TaskHandler())
|
s.router.GET("/task/:uuid", s.TaskHandler())
|
||||||
|
|
||||||
s.router.GET("/add", s.AddHandler())
|
s.router.GET("/add", s.AddHandler())
|
||||||
|
|||||||
@@ -8,6 +8,22 @@ import (
|
|||||||
log "github.com/sirupsen/logrus"
|
log "github.com/sirupsen/logrus"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
// TasksHandler ...
|
||||||
|
func (s *Server) TasksHandler() httprouter.Handle {
|
||||||
|
return func(w http.ResponseWriter, r *http.Request, p httprouter.Params) {
|
||||||
|
tasks := s.tasks.Tasks()
|
||||||
|
|
||||||
|
data, err := json.Marshal(tasks)
|
||||||
|
if err != nil {
|
||||||
|
http.Error(w, err.Error(), http.StatusInternalServerError)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
w.Header().Set("Content-Type", "application/json")
|
||||||
|
_, _ = w.Write(data)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// TaskHandler ...
|
// TaskHandler ...
|
||||||
func (s *Server) TaskHandler() httprouter.Handle {
|
func (s *Server) TaskHandler() httprouter.Handle {
|
||||||
return func(w http.ResponseWriter, r *http.Request, p httprouter.Params) {
|
return func(w http.ResponseWriter, r *http.Request, p httprouter.Params) {
|
||||||
Reference in New Issue
Block a user