From b69b27eeedb78000c77c5bc4475f627363fae40c Mon Sep 17 00:00:00 2001 From: James Mills Date: Mon, 1 Feb 2021 21:23:58 +1000 Subject: [PATCH] Add crawler skeleton and add handler --- go.mod | 7 ++++++ go.sum | 22 ++++++++--------- internal/crawler.go | 34 ++++++++++++++++++++++++++ internal/handlers.go | 25 ++++++++++++++++++- internal/models.go | 58 ++++++++++++++++++++++++++++++++++++++++++++ internal/server.go | 15 ++++++++++++ 6 files changed, 148 insertions(+), 13 deletions(-) create mode 100644 internal/crawler.go diff --git a/go.mod b/go.mod index 1218534..c6fe0a3 100644 --- a/go.mod +++ b/go.mod @@ -8,13 +8,17 @@ require ( github.com/Masterminds/semver v1.5.0 // indirect github.com/Masterminds/sprig v2.22.0+incompatible github.com/NYTimes/gziphandler v1.1.1 + github.com/PuerkitoBio/goquery v1.6.1 // indirect github.com/andreadipersio/securecookie v0.0.0-20131119095127-e3c3b33544ec + github.com/antchfx/htmlquery v1.2.3 // indirect + github.com/antchfx/xmlquery v1.3.3 // indirect github.com/creasty/defaults v1.5.1 github.com/dgrijalva/jwt-go v3.2.0+incompatible github.com/dustin/go-humanize v1.0.0 github.com/elithrar/simple-scrypt v1.3.0 github.com/gabstv/merger v1.0.1 github.com/go-mail/mail v2.3.1+incompatible + github.com/gobwas/glob v0.2.3 // indirect github.com/goccy/go-yaml v1.8.6 github.com/gocolly/colly/v2 v2.1.0 github.com/gomarkdown/markdown v0.0.0-20201113031856-722100d81a8e @@ -25,6 +29,7 @@ require ( github.com/james4k/fmatter v0.0.0-20150827042251-377c8ea6259d github.com/julienschmidt/httprouter v1.3.0 github.com/justinas/nosurf v1.1.1 + github.com/kennygrant/sanitize v1.2.4 // indirect github.com/mitchellh/copystructure v1.0.0 // indirect github.com/patrickmn/go-cache v2.1.0+incompatible github.com/prologic/bitcask v0.3.10 @@ -32,10 +37,12 @@ require ( github.com/rainycape/unidecode v0.0.0-20150907023854-cb7f23ec59be // indirect github.com/renstrom/shortuuid v2.0.3+incompatible github.com/robfig/cron v1.2.0 + github.com/saintfish/chardet v0.0.0-20120816061221-3af4cd4741ca // indirect github.com/satori/go.uuid v1.2.0 // indirect github.com/sirupsen/logrus v1.7.0 github.com/spf13/pflag v1.0.5 github.com/steambap/captcha v1.3.1 + github.com/temoto/robotstxt v1.1.1 // indirect github.com/unrolled/logger v0.0.0-20201216141554-31a3694fe979 github.com/vcraescu/go-paginator v1.0.0 github.com/wblakecaldwell/profiler v0.0.0-20150908040756-6111ef1313a1 diff --git a/go.sum b/go.sum index 0e23fad..73a39a2 100644 --- a/go.sum +++ b/go.sum @@ -25,8 +25,8 @@ github.com/Masterminds/sprig v2.22.0+incompatible/go.mod h1:y6hNFY5UBTIWBxnzTeuN github.com/NYTimes/gziphandler v1.1.1 h1:ZUDjpQae29j0ryrS0u/B8HZfJBtBQHjqw2rQ2cqUQ3I= github.com/NYTimes/gziphandler v1.1.1/go.mod h1:n/CVRwUEOgIxrgPvAQhUUr9oeUtvrhMomdKFjzJNB0c= github.com/OneOfOne/xxhash v1.2.2/go.mod h1:HSdplMjZKSmBqAxg5vPj2TmRDmfkzw+cTzAElWljhcU= -github.com/PuerkitoBio/goquery v1.5.1 h1:PSPBGne8NIUWw+/7vFBV+kG2J/5MOjbzc7154OaKCSE= -github.com/PuerkitoBio/goquery v1.5.1/go.mod h1:GsLWisAFVj4WgDibEWF4pvYnkVQBpKBKeU+7zCJoLcc= +github.com/PuerkitoBio/goquery v1.6.1 h1:FgjbQZKl5HTmcn4sKBgvx8vv63nhyhIpv7lJpFGCWpk= +github.com/PuerkitoBio/goquery v1.6.1/go.mod h1:GsLWisAFVj4WgDibEWF4pvYnkVQBpKBKeU+7zCJoLcc= github.com/PuerkitoBio/purell v1.1.1 h1:WEQqlqaGbrPkxLJWfBwQmfEAE1Z7ONdDLqrN38tNFfI= github.com/PuerkitoBio/purell v1.1.1/go.mod h1:c11w/QuzBsJSee3cPx9rAFu61PvFxuPbtSwDGJws/X0= github.com/PuerkitoBio/urlesc v0.0.0-20170810143723-de5bf2ad4578 h1:d+Bc7a5rLufV/sSk/8dngufqelfh6jnri85riMAaF/M= @@ -36,16 +36,15 @@ github.com/alecthomas/template v0.0.0-20160405071501-a0175ee3bccc/go.mod h1:LOuy github.com/alecthomas/units v0.0.0-20151022065526-2efee857e7cf/go.mod h1:ybxpYRFXyAe+OPACYpWeL0wqObRcbAqCMya13uyzqw0= github.com/andreadipersio/securecookie v0.0.0-20131119095127-e3c3b33544ec h1:h8ZUCz6pj641NovNuhh/iowIh8yjwtES/Qm61C8lFuM= github.com/andreadipersio/securecookie v0.0.0-20131119095127-e3c3b33544ec/go.mod h1:vX8uUNqOR/LOTwsISi5thUTqArUhyOvn7Tp5/paowwA= +github.com/andybalholm/cascadia v1.1.0 h1:BuuO6sSfQNFRu1LppgbD25Hr2vLYW25JvxHs5zzsLTo= github.com/andybalholm/cascadia v1.1.0/go.mod h1:GsXiBklL0woXo1j/WYWtSYYC4ouU9PqHO0sqidkEA4Y= -github.com/andybalholm/cascadia v1.2.0 h1:vuRCkM5Ozh/BfmsaTm26kbjm0mIOM3yS5Ek/F5h18aE= -github.com/andybalholm/cascadia v1.2.0/go.mod h1:YCyR8vOZT9aZ1CHEd8ap0gMVm2aFgxBp0T0eFw1RUQY= github.com/antchfx/htmlquery v1.2.3 h1:sP3NFDneHx2stfNXCKbhHFo8XgNjCACnU/4AO5gWz6M= github.com/antchfx/htmlquery v1.2.3/go.mod h1:B0ABL+F5irhhMWg54ymEZinzMSi0Kt3I2if0BLYa3V0= -github.com/antchfx/xmlquery v1.2.4 h1:T/SH1bYdzdjTMoz2RgsfVKbM5uWh3gjDYYepFqQmFv4= -github.com/antchfx/xmlquery v1.2.4/go.mod h1:KQQuESaxSlqugE2ZBcM/qn+ebIpt+d+4Xx7YcSGAIrM= +github.com/antchfx/xmlquery v1.3.3 h1:HYmadPG0uz8CySdL68rB4DCLKXz2PurCjS3mnkVF4CQ= +github.com/antchfx/xmlquery v1.3.3/go.mod h1:64w0Xesg2sTaawIdNqMB+7qaW/bSqkQm+ssPaCMWNnc= github.com/antchfx/xpath v1.1.6/go.mod h1:Yee4kTMuNiPYJ7nSNorELQMr1J33uOpXDMByNYhvtNk= -github.com/antchfx/xpath v1.1.8 h1:PcL6bIX42Px5usSx6xRYw/wjB3wYGkj0MJ9MBzEKVgk= -github.com/antchfx/xpath v1.1.8/go.mod h1:Yee4kTMuNiPYJ7nSNorELQMr1J33uOpXDMByNYhvtNk= +github.com/antchfx/xpath v1.1.10 h1:cJ0pOvEdN/WvYXxvRrzQH9x5QWKpzHacYO8qzCcDYAg= +github.com/antchfx/xpath v1.1.10/go.mod h1:Yee4kTMuNiPYJ7nSNorELQMr1J33uOpXDMByNYhvtNk= github.com/armon/circbuf v0.0.0-20150827004946-bbbad097214e/go.mod h1:3U/XgcO3hCbHZ8TKRvWD2dDTCfh9M9ya+I9JpbB7O8o= github.com/armon/consul-api v0.0.0-20180202201655-eb2c6b5be1b6/go.mod h1:grANhF5doyWs3UAsr3K4I6qtAmlQcZDesFNEHPZAzj8= github.com/armon/go-metrics v0.0.0-20180917152333-f0300d1749da/go.mod h1:Q73ZrmVTwzkszR9V5SSuryQ31EELlFMUz1kKyl939pY= @@ -109,8 +108,6 @@ github.com/goccy/go-yaml v1.8.6 h1:xOsXodQ17pkM420Ai0DROYyLbx8FAmX0KhU8MY6ZIg0= github.com/goccy/go-yaml v1.8.6/go.mod h1:U/jl18uSupI5rdI2jmuCswEA2htH9eXfferR3KfscvA= github.com/gocolly/colly v1.2.0 h1:qRz9YAn8FIH0qzgNUw+HT9UN7wm1oF9OBAilwEWpyrI= github.com/gocolly/colly v1.2.0/go.mod h1:Hof5T3ZswNVsOHYmba1u03W65HDWgpV5HifSuueE0EA= -github.com/gocolly/colly/v2 v2.1.0 h1:k0DuZkDoCsx51bKpRJNEmcxcp+W5N8ziuwGaSDuFoGs= -github.com/gocolly/colly/v2 v2.1.0/go.mod h1:I2MuhsLjQ+Ex+IzK3afNS8/1qP3AedHOusRPcRdC5o0= github.com/gofrs/uuid v3.3.0+incompatible/go.mod h1:b2aQJv3Z4Fp6yNu3cdSllBxTCLRxnplIgP/c0N/04lM= github.com/gogo/protobuf v1.1.1/go.mod h1:r8qH/GZQm5c6nD/R0oafs1akxWv10x8SbQlK7atdtwQ= github.com/gogo/protobuf v1.2.1/go.mod h1:hp+jE20tsWTFYpLwKvXlhS1hjn+gTNwPg2I6zVXpSg4= @@ -404,8 +401,8 @@ golang.org/x/net v0.0.0-20191119073136-fc4aabc6c914 h1:MlY3mEfbnWGmUi4rtHOtNnnnN golang.org/x/net v0.0.0-20191119073136-fc4aabc6c914/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20200202094626-16171245cfb2/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20200421231249-e086a090c8fd/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A= -golang.org/x/net v0.0.0-20200602114024-627f9648deb9 h1:pNX+40auqi2JqRfOP1akLGtYcn15TUbkhwuCO3foqqM= -golang.org/x/net v0.0.0-20200602114024-627f9648deb9/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A= +golang.org/x/net v0.0.0-20200813134508-3edf25e44fcc h1:zK/HqS5bZxDptfPJNq8v7vJfXtkU7r9TLIoSr1bXaP4= +golang.org/x/net v0.0.0-20200813134508-3edf25e44fcc/go.mod h1:/O7V0waA8r7cgGh81Ro3o1hOxt32SMVPicZroKQ2sZA= golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U= golang.org/x/oauth2 v0.0.0-20190226205417-e64efc72b421/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= golang.org/x/oauth2 v0.0.0-20190604053449-0f29369cfe45/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= @@ -477,6 +474,7 @@ google.golang.org/api v0.13.0/go.mod h1:iLdEw5Ide6rF15KTC1Kkl0iskquN2gFfn9o9XIsb google.golang.org/appengine v1.1.0/go.mod h1:EbEs0AVv82hx2wNQdGPgUI5lhzA/G0D9YwlJXL52JkM= google.golang.org/appengine v1.4.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4= google.golang.org/appengine v1.5.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4= +google.golang.org/appengine v1.6.1 h1:QzqyMA1tlu6CgqCDUtU9V+ZKhLFT2dkJuANu5QaxI3I= google.golang.org/appengine v1.6.1/go.mod h1:i06prIuMbXzDqacNJfV5OdTW448YApPu5ww/cMBSeb0= google.golang.org/appengine v1.6.6 h1:lMO5rYAqUxkmaj76jAkRUvt5JZgFymx/+Q5Mzfivuhc= google.golang.org/appengine v1.6.6/go.mod h1:8WjMMxjGQR8xUklV/ARdw2HLXBOI7O7uCIDZVag1xfc= diff --git a/internal/crawler.go b/internal/crawler.go new file mode 100644 index 0000000..26537d4 --- /dev/null +++ b/internal/crawler.go @@ -0,0 +1,34 @@ +package internal + +import ( + log "github.com/sirupsen/logrus" +) + +type Crawler interface { + Start() + Crawl(url string) error +} + +type crawler struct { + q chan string +} + +func NewCrawler() (Crawler, error) { + return &crawler{q: make(chan string)}, nil +} + +func (c *crawler) loop() { + for { + url := <-c.q + log.Debugf("crawling %s", url) + } +} + +func (c *crawler) Crawl(url string) error { + c.q <- url + return nil +} + +func (c *crawler) Start() { + go c.loop() +} diff --git a/internal/handlers.go b/internal/handlers.go index c4b777e..ef08357 100644 --- a/internal/handlers.go +++ b/internal/handlers.go @@ -103,7 +103,30 @@ func (s *Server) IndexHandler() httprouter.Handle { func (s *Server) AddHandler() httprouter.Handle { return func(w http.ResponseWriter, r *http.Request, _ httprouter.Params) { ctx := NewContext(s.config, s.db, r) - s.render("add", w, ctx) + + if r.Method == http.MethodGet { + s.render("add", w, ctx) + return + } + + url := NormalizeURL(strings.TrimSpace(r.FormValue("url"))) + if url == "" { + ctx.Error = true + ctx.Message = "Invalid URL" + s.render("error", w, ctx) + return + } + + if err := s.crawler.Crawl(url); err != nil { + ctx.Error = true + ctx.Message = fmt.Sprintf("Error adding URL: %s", err) + s.render("error", w, ctx) + return + } + + ctx.Error = false + ctx.Message = "Successfully added url" + s.render("error", w, ctx) } } diff --git a/internal/models.go b/internal/models.go index e2ca288..b8e12be 100644 --- a/internal/models.go +++ b/internal/models.go @@ -8,6 +8,10 @@ import ( log "github.com/sirupsen/logrus" ) +const ( + URLHashLength = 15 +) + // User ... type User struct { Username string @@ -28,6 +32,15 @@ type Token struct { ExpiresAt time.Time } +// URL ... +type URL struct { + URL string + CrawledAt time.Time + ExpiresAt time.Time + + hash string +} + func LoadToken(data []byte) (token *Token, err error) { token = &Token{} if err := defaults.Set(token); err != nil { @@ -99,3 +112,48 @@ func (u *User) Bytes() ([]byte, error) { } return data, nil } + +// NewURL ... +func NewURL() *URL { + u := &URL{} + if err := defaults.Set(u); err != nil { + log.WithError(err).Error("error creating new URI object") + } + return u +} + +func LoadURL(data []byte) (u *URL, err error) { + u = &URL{} + if err := defaults.Set(u); err != nil { + return nil, err + } + + if err = json.Unmarshal(data, &u); err != nil { + return nil, err + } + + return +} + +func (u *URL) Hash() string { + if u.hash != "" { + return u.hash + } + + hash := FastHash(u.String()) + u.hash = hash[len(hash)-URLHashLength:] + + return u.hash +} + +func (u *URL) String() string { + return u.URL +} + +func (u *URL) Bytes() ([]byte, error) { + data, err := json.Marshal(u) + if err != nil { + return nil, err + } + return data, nil +} diff --git a/internal/server.go b/internal/server.go index ff20a8d..abe94f0 100644 --- a/internal/server.go +++ b/internal/server.go @@ -41,6 +41,9 @@ type Server struct { router *Router server *http.Server + // Crawler + crawler Crawler + // Data Store db Store @@ -345,6 +348,12 @@ func NewServer(bind string, options ...Option) (*Server, error) { return nil, fmt.Errorf("error validating config: %w", err) } + crawler, err := NewCrawler() + if err != nil { + log.WithError(err).Error("error creating crawler") + return nil, err + } + db, err := NewStore(config.Store) if err != nil { log.WithError(err).Error("error creating store") @@ -406,6 +415,9 @@ func NewServer(bind string, options ...Option) (*Server, error) { // API api: api, + // Crawler + crawler: crawler, + // Data Store db: db, @@ -430,6 +442,9 @@ func NewServer(bind string, options ...Option) (*Server, error) { server.cron.Start() log.Info("started background jobs") + server.crawler.Start() + log.Infof("started crawler") + server.setupMetrics() log.Infof("serving metrics endpoint at %s/metrics", server.config.BaseURL)