Add crawler skeleton and add handler

This commit is contained in:
James Mills
2021-02-01 21:23:58 +10:00
parent 8c84ee8b3d
commit b69b27eeed
6 changed files with 148 additions and 13 deletions

7
go.mod
View File

@@ -8,13 +8,17 @@ require (
github.com/Masterminds/semver v1.5.0 // indirect
github.com/Masterminds/sprig v2.22.0+incompatible
github.com/NYTimes/gziphandler v1.1.1
github.com/PuerkitoBio/goquery v1.6.1 // indirect
github.com/andreadipersio/securecookie v0.0.0-20131119095127-e3c3b33544ec
github.com/antchfx/htmlquery v1.2.3 // indirect
github.com/antchfx/xmlquery v1.3.3 // indirect
github.com/creasty/defaults v1.5.1
github.com/dgrijalva/jwt-go v3.2.0+incompatible
github.com/dustin/go-humanize v1.0.0
github.com/elithrar/simple-scrypt v1.3.0
github.com/gabstv/merger v1.0.1
github.com/go-mail/mail v2.3.1+incompatible
github.com/gobwas/glob v0.2.3 // indirect
github.com/goccy/go-yaml v1.8.6
github.com/gocolly/colly/v2 v2.1.0
github.com/gomarkdown/markdown v0.0.0-20201113031856-722100d81a8e
@@ -25,6 +29,7 @@ require (
github.com/james4k/fmatter v0.0.0-20150827042251-377c8ea6259d
github.com/julienschmidt/httprouter v1.3.0
github.com/justinas/nosurf v1.1.1
github.com/kennygrant/sanitize v1.2.4 // indirect
github.com/mitchellh/copystructure v1.0.0 // indirect
github.com/patrickmn/go-cache v2.1.0+incompatible
github.com/prologic/bitcask v0.3.10
@@ -32,10 +37,12 @@ require (
github.com/rainycape/unidecode v0.0.0-20150907023854-cb7f23ec59be // indirect
github.com/renstrom/shortuuid v2.0.3+incompatible
github.com/robfig/cron v1.2.0
github.com/saintfish/chardet v0.0.0-20120816061221-3af4cd4741ca // indirect
github.com/satori/go.uuid v1.2.0 // indirect
github.com/sirupsen/logrus v1.7.0
github.com/spf13/pflag v1.0.5
github.com/steambap/captcha v1.3.1
github.com/temoto/robotstxt v1.1.1 // indirect
github.com/unrolled/logger v0.0.0-20201216141554-31a3694fe979
github.com/vcraescu/go-paginator v1.0.0
github.com/wblakecaldwell/profiler v0.0.0-20150908040756-6111ef1313a1

22
go.sum
View File

@@ -25,8 +25,8 @@ github.com/Masterminds/sprig v2.22.0+incompatible/go.mod h1:y6hNFY5UBTIWBxnzTeuN
github.com/NYTimes/gziphandler v1.1.1 h1:ZUDjpQae29j0ryrS0u/B8HZfJBtBQHjqw2rQ2cqUQ3I=
github.com/NYTimes/gziphandler v1.1.1/go.mod h1:n/CVRwUEOgIxrgPvAQhUUr9oeUtvrhMomdKFjzJNB0c=
github.com/OneOfOne/xxhash v1.2.2/go.mod h1:HSdplMjZKSmBqAxg5vPj2TmRDmfkzw+cTzAElWljhcU=
github.com/PuerkitoBio/goquery v1.5.1 h1:PSPBGne8NIUWw+/7vFBV+kG2J/5MOjbzc7154OaKCSE=
github.com/PuerkitoBio/goquery v1.5.1/go.mod h1:GsLWisAFVj4WgDibEWF4pvYnkVQBpKBKeU+7zCJoLcc=
github.com/PuerkitoBio/goquery v1.6.1 h1:FgjbQZKl5HTmcn4sKBgvx8vv63nhyhIpv7lJpFGCWpk=
github.com/PuerkitoBio/goquery v1.6.1/go.mod h1:GsLWisAFVj4WgDibEWF4pvYnkVQBpKBKeU+7zCJoLcc=
github.com/PuerkitoBio/purell v1.1.1 h1:WEQqlqaGbrPkxLJWfBwQmfEAE1Z7ONdDLqrN38tNFfI=
github.com/PuerkitoBio/purell v1.1.1/go.mod h1:c11w/QuzBsJSee3cPx9rAFu61PvFxuPbtSwDGJws/X0=
github.com/PuerkitoBio/urlesc v0.0.0-20170810143723-de5bf2ad4578 h1:d+Bc7a5rLufV/sSk/8dngufqelfh6jnri85riMAaF/M=
@@ -36,16 +36,15 @@ github.com/alecthomas/template v0.0.0-20160405071501-a0175ee3bccc/go.mod h1:LOuy
github.com/alecthomas/units v0.0.0-20151022065526-2efee857e7cf/go.mod h1:ybxpYRFXyAe+OPACYpWeL0wqObRcbAqCMya13uyzqw0=
github.com/andreadipersio/securecookie v0.0.0-20131119095127-e3c3b33544ec h1:h8ZUCz6pj641NovNuhh/iowIh8yjwtES/Qm61C8lFuM=
github.com/andreadipersio/securecookie v0.0.0-20131119095127-e3c3b33544ec/go.mod h1:vX8uUNqOR/LOTwsISi5thUTqArUhyOvn7Tp5/paowwA=
github.com/andybalholm/cascadia v1.1.0 h1:BuuO6sSfQNFRu1LppgbD25Hr2vLYW25JvxHs5zzsLTo=
github.com/andybalholm/cascadia v1.1.0/go.mod h1:GsXiBklL0woXo1j/WYWtSYYC4ouU9PqHO0sqidkEA4Y=
github.com/andybalholm/cascadia v1.2.0 h1:vuRCkM5Ozh/BfmsaTm26kbjm0mIOM3yS5Ek/F5h18aE=
github.com/andybalholm/cascadia v1.2.0/go.mod h1:YCyR8vOZT9aZ1CHEd8ap0gMVm2aFgxBp0T0eFw1RUQY=
github.com/antchfx/htmlquery v1.2.3 h1:sP3NFDneHx2stfNXCKbhHFo8XgNjCACnU/4AO5gWz6M=
github.com/antchfx/htmlquery v1.2.3/go.mod h1:B0ABL+F5irhhMWg54ymEZinzMSi0Kt3I2if0BLYa3V0=
github.com/antchfx/xmlquery v1.2.4 h1:T/SH1bYdzdjTMoz2RgsfVKbM5uWh3gjDYYepFqQmFv4=
github.com/antchfx/xmlquery v1.2.4/go.mod h1:KQQuESaxSlqugE2ZBcM/qn+ebIpt+d+4Xx7YcSGAIrM=
github.com/antchfx/xmlquery v1.3.3 h1:HYmadPG0uz8CySdL68rB4DCLKXz2PurCjS3mnkVF4CQ=
github.com/antchfx/xmlquery v1.3.3/go.mod h1:64w0Xesg2sTaawIdNqMB+7qaW/bSqkQm+ssPaCMWNnc=
github.com/antchfx/xpath v1.1.6/go.mod h1:Yee4kTMuNiPYJ7nSNorELQMr1J33uOpXDMByNYhvtNk=
github.com/antchfx/xpath v1.1.8 h1:PcL6bIX42Px5usSx6xRYw/wjB3wYGkj0MJ9MBzEKVgk=
github.com/antchfx/xpath v1.1.8/go.mod h1:Yee4kTMuNiPYJ7nSNorELQMr1J33uOpXDMByNYhvtNk=
github.com/antchfx/xpath v1.1.10 h1:cJ0pOvEdN/WvYXxvRrzQH9x5QWKpzHacYO8qzCcDYAg=
github.com/antchfx/xpath v1.1.10/go.mod h1:Yee4kTMuNiPYJ7nSNorELQMr1J33uOpXDMByNYhvtNk=
github.com/armon/circbuf v0.0.0-20150827004946-bbbad097214e/go.mod h1:3U/XgcO3hCbHZ8TKRvWD2dDTCfh9M9ya+I9JpbB7O8o=
github.com/armon/consul-api v0.0.0-20180202201655-eb2c6b5be1b6/go.mod h1:grANhF5doyWs3UAsr3K4I6qtAmlQcZDesFNEHPZAzj8=
github.com/armon/go-metrics v0.0.0-20180917152333-f0300d1749da/go.mod h1:Q73ZrmVTwzkszR9V5SSuryQ31EELlFMUz1kKyl939pY=
@@ -109,8 +108,6 @@ github.com/goccy/go-yaml v1.8.6 h1:xOsXodQ17pkM420Ai0DROYyLbx8FAmX0KhU8MY6ZIg0=
github.com/goccy/go-yaml v1.8.6/go.mod h1:U/jl18uSupI5rdI2jmuCswEA2htH9eXfferR3KfscvA=
github.com/gocolly/colly v1.2.0 h1:qRz9YAn8FIH0qzgNUw+HT9UN7wm1oF9OBAilwEWpyrI=
github.com/gocolly/colly v1.2.0/go.mod h1:Hof5T3ZswNVsOHYmba1u03W65HDWgpV5HifSuueE0EA=
github.com/gocolly/colly/v2 v2.1.0 h1:k0DuZkDoCsx51bKpRJNEmcxcp+W5N8ziuwGaSDuFoGs=
github.com/gocolly/colly/v2 v2.1.0/go.mod h1:I2MuhsLjQ+Ex+IzK3afNS8/1qP3AedHOusRPcRdC5o0=
github.com/gofrs/uuid v3.3.0+incompatible/go.mod h1:b2aQJv3Z4Fp6yNu3cdSllBxTCLRxnplIgP/c0N/04lM=
github.com/gogo/protobuf v1.1.1/go.mod h1:r8qH/GZQm5c6nD/R0oafs1akxWv10x8SbQlK7atdtwQ=
github.com/gogo/protobuf v1.2.1/go.mod h1:hp+jE20tsWTFYpLwKvXlhS1hjn+gTNwPg2I6zVXpSg4=
@@ -404,8 +401,8 @@ golang.org/x/net v0.0.0-20191119073136-fc4aabc6c914 h1:MlY3mEfbnWGmUi4rtHOtNnnnN
golang.org/x/net v0.0.0-20191119073136-fc4aabc6c914/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/net v0.0.0-20200202094626-16171245cfb2/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/net v0.0.0-20200421231249-e086a090c8fd/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A=
golang.org/x/net v0.0.0-20200602114024-627f9648deb9 h1:pNX+40auqi2JqRfOP1akLGtYcn15TUbkhwuCO3foqqM=
golang.org/x/net v0.0.0-20200602114024-627f9648deb9/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A=
golang.org/x/net v0.0.0-20200813134508-3edf25e44fcc h1:zK/HqS5bZxDptfPJNq8v7vJfXtkU7r9TLIoSr1bXaP4=
golang.org/x/net v0.0.0-20200813134508-3edf25e44fcc/go.mod h1:/O7V0waA8r7cgGh81Ro3o1hOxt32SMVPicZroKQ2sZA=
golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U=
golang.org/x/oauth2 v0.0.0-20190226205417-e64efc72b421/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw=
golang.org/x/oauth2 v0.0.0-20190604053449-0f29369cfe45/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw=
@@ -477,6 +474,7 @@ google.golang.org/api v0.13.0/go.mod h1:iLdEw5Ide6rF15KTC1Kkl0iskquN2gFfn9o9XIsb
google.golang.org/appengine v1.1.0/go.mod h1:EbEs0AVv82hx2wNQdGPgUI5lhzA/G0D9YwlJXL52JkM=
google.golang.org/appengine v1.4.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4=
google.golang.org/appengine v1.5.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4=
google.golang.org/appengine v1.6.1 h1:QzqyMA1tlu6CgqCDUtU9V+ZKhLFT2dkJuANu5QaxI3I=
google.golang.org/appengine v1.6.1/go.mod h1:i06prIuMbXzDqacNJfV5OdTW448YApPu5ww/cMBSeb0=
google.golang.org/appengine v1.6.6 h1:lMO5rYAqUxkmaj76jAkRUvt5JZgFymx/+Q5Mzfivuhc=
google.golang.org/appengine v1.6.6/go.mod h1:8WjMMxjGQR8xUklV/ARdw2HLXBOI7O7uCIDZVag1xfc=

34
internal/crawler.go Normal file
View File

@@ -0,0 +1,34 @@
package internal
import (
log "github.com/sirupsen/logrus"
)
// Crawler accepts URLs that should be crawled.
//
// The implementation in this file (crawler) hands submitted URLs to a
// background goroutine over a channel.
type Crawler interface {
	// Start begins processing of submitted URLs.
	Start()

	// Crawl submits url for crawling and reports any error in doing so.
	Crawl(url string) error
}
// crawler is the package's Crawler implementation.
type crawler struct {
	// q carries submitted URLs from Crawl to the loop goroutine.
	q chan string
}
// NewCrawler builds a crawler with an unbuffered URL queue.
//
// The returned error is currently always nil. The crawler does nothing
// until Start is called.
func NewCrawler() (Crawler, error) {
	c := &crawler{
		q: make(chan string),
	}
	return c, nil
}
// loop receives URLs from the queue forever and logs each one at debug
// level. It never returns, so it is meant to run in its own goroutine.
func (c *crawler) loop() {
	for {
		log.Debugf("crawling %s", <-c.q)
	}
}
// Crawl queues url for crawling by sending it on the crawler's channel.
//
// NewCrawler creates that channel unbuffered, so the send blocks until the
// loop goroutine receives the value. The returned error is always nil.
func (c *crawler) Crawl(url string) error {
	c.q <- url

	return nil
}
// Start launches the receive loop in a new goroutine and returns
// immediately. Each call starts another goroutine; nothing here stops them.
func (c *crawler) Start() {
	go c.loop()
}

View File

@@ -103,7 +103,30 @@ func (s *Server) IndexHandler() httprouter.Handle {
// AddHandler returns the handler behind the "add" page.
//
// A GET request renders the "add" template. Any other method is treated as
// a submission: the "url" form value is trimmed, passed through
// NormalizeURL and handed to the crawler. The outcome is reported through
// ctx.Error / ctx.Message and rendered with the "error" template, which is
// also used for the success message.
func (s *Server) AddHandler() httprouter.Handle {
	return func(w http.ResponseWriter, r *http.Request, _ httprouter.Params) {
		ctx := NewContext(s.config, s.db, r)

		// Render the form only for GET. Rendering it unconditionally up
		// front would emit the page twice for GET and would prepend the
		// form to the result page for submissions.
		if r.Method == http.MethodGet {
			s.render("add", w, ctx)
			return
		}

		// An empty result is treated as an invalid submission.
		// NOTE(review): presumably NormalizeURL returns "" for input it
		// cannot normalize — confirm against its definition.
		target := NormalizeURL(strings.TrimSpace(r.FormValue("url")))
		if target == "" {
			ctx.Error = true
			ctx.Message = "Invalid URL"
			s.render("error", w, ctx)
			return
		}

		if err := s.crawler.Crawl(target); err != nil {
			ctx.Error = true
			ctx.Message = fmt.Sprintf("Error adding URL: %s", err)
			s.render("error", w, ctx)
			return
		}

		ctx.Error = false
		ctx.Message = "Successfully added url"
		s.render("error", w, ctx)
	}
}

View File

@@ -8,6 +8,10 @@ import (
log "github.com/sirupsen/logrus"
)
// URLHashLength is the number of trailing characters of a URL's FastHash
// digest that URL.Hash keeps.
const URLHashLength = 15
// User ...
type User struct {
Username string
@@ -28,6 +32,15 @@ type Token struct {
ExpiresAt time.Time
}
// URL is a record for a single URL, with crawl and expiry timestamps.
type URL struct {
	// URL is the address itself; String returns it verbatim.
	URL string

	// CrawledAt and ExpiresAt are the timestamps stored with the record.
	CrawledAt, ExpiresAt time.Time

	// hash memoizes the result of Hash. It is unexported, so it is not
	// part of the JSON encoding.
	hash string
}
func LoadToken(data []byte) (token *Token, err error) {
token = &Token{}
if err := defaults.Set(token); err != nil {
@@ -99,3 +112,48 @@ func (u *User) Bytes() ([]byte, error) {
}
return data, nil
}
// NewURL allocates a URL and applies defaults.Set to it.
//
// A failure from defaults.Set is logged, not returned; the URL is handed
// back to the caller either way.
func NewURL() *URL {
	entry := new(URL)

	err := defaults.Set(entry)
	if err != nil {
		log.WithError(err).Error("error creating new URI object")
	}

	return entry
}
// LoadURL decodes a JSON-encoded URL record from data.
//
// The record is first initialised with defaults.Set and the JSON document
// is then decoded on top of it. On any failure it returns a nil URL and
// the error.
func LoadURL(data []byte) (u *URL, err error) {
	u = &URL{}
	if err := defaults.Set(u); err != nil {
		return nil, err
	}

	// Decode into the struct (u), not into the pointer variable (&u).
	// With &u, a JSON "null" document sets u itself to nil and the caller
	// gets (nil, nil); decoding into the struct makes "null" a no-op.
	if err := json.Unmarshal(data, u); err != nil {
		return nil, err
	}

	return u, nil
}
// Hash returns a short identifier for the URL: the trailing URLHashLength
// bytes of FastHash(u.String()).
//
// A non-empty result is memoized in u.hash, so later calls return the
// cached value even if u.URL has since changed. The cache write is not
// synchronized.
func (u *URL) Hash() string {
	if u.hash != "" {
		return u.hash
	}

	sum := FastHash(u.String())

	// Only truncate when the digest is longer than the wanted length.
	// Slicing a shorter digest would panic with an out-of-range index.
	if n := len(sum); n > URLHashLength {
		sum = sum[n-URLHashLength:]
	}

	u.hash = sum
	return u.hash
}
// String returns the stored URL text unchanged.
func (u *URL) String() string {
	return u.URL
}
// Bytes returns the JSON encoding of the URL record, or a nil slice and
// the error if marshalling fails.
func (u *URL) Bytes() ([]byte, error) {
	return json.Marshal(u)
}

View File

@@ -41,6 +41,9 @@ type Server struct {
router *Router
server *http.Server
// Crawler
crawler Crawler
// Data Store
db Store
@@ -345,6 +348,12 @@ func NewServer(bind string, options ...Option) (*Server, error) {
return nil, fmt.Errorf("error validating config: %w", err)
}
crawler, err := NewCrawler()
if err != nil {
log.WithError(err).Error("error creating crawler")
return nil, err
}
db, err := NewStore(config.Store)
if err != nil {
log.WithError(err).Error("error creating store")
@@ -406,6 +415,9 @@ func NewServer(bind string, options ...Option) (*Server, error) {
// API
api: api,
// Crawler
crawler: crawler,
// Data Store
db: db,
@@ -430,6 +442,9 @@ func NewServer(bind string, options ...Option) (*Server, error) {
server.cron.Start()
log.Info("started background jobs")
server.crawler.Start()
log.Infof("started crawler")
server.setupMetrics()
log.Infof("serving metrics endpoint at %s/metrics", server.config.BaseURL)