Add crawler skeleton and add handler
This commit is contained in:
7
go.mod
7
go.mod
@@ -8,13 +8,17 @@ require (
|
||||
github.com/Masterminds/semver v1.5.0 // indirect
|
||||
github.com/Masterminds/sprig v2.22.0+incompatible
|
||||
github.com/NYTimes/gziphandler v1.1.1
|
||||
github.com/PuerkitoBio/goquery v1.6.1 // indirect
|
||||
github.com/andreadipersio/securecookie v0.0.0-20131119095127-e3c3b33544ec
|
||||
github.com/antchfx/htmlquery v1.2.3 // indirect
|
||||
github.com/antchfx/xmlquery v1.3.3 // indirect
|
||||
github.com/creasty/defaults v1.5.1
|
||||
github.com/dgrijalva/jwt-go v3.2.0+incompatible
|
||||
github.com/dustin/go-humanize v1.0.0
|
||||
github.com/elithrar/simple-scrypt v1.3.0
|
||||
github.com/gabstv/merger v1.0.1
|
||||
github.com/go-mail/mail v2.3.1+incompatible
|
||||
github.com/gobwas/glob v0.2.3 // indirect
|
||||
github.com/goccy/go-yaml v1.8.6
|
||||
github.com/gocolly/colly/v2 v2.1.0
|
||||
github.com/gomarkdown/markdown v0.0.0-20201113031856-722100d81a8e
|
||||
@@ -25,6 +29,7 @@ require (
|
||||
github.com/james4k/fmatter v0.0.0-20150827042251-377c8ea6259d
|
||||
github.com/julienschmidt/httprouter v1.3.0
|
||||
github.com/justinas/nosurf v1.1.1
|
||||
github.com/kennygrant/sanitize v1.2.4 // indirect
|
||||
github.com/mitchellh/copystructure v1.0.0 // indirect
|
||||
github.com/patrickmn/go-cache v2.1.0+incompatible
|
||||
github.com/prologic/bitcask v0.3.10
|
||||
@@ -32,10 +37,12 @@ require (
|
||||
github.com/rainycape/unidecode v0.0.0-20150907023854-cb7f23ec59be // indirect
|
||||
github.com/renstrom/shortuuid v2.0.3+incompatible
|
||||
github.com/robfig/cron v1.2.0
|
||||
github.com/saintfish/chardet v0.0.0-20120816061221-3af4cd4741ca // indirect
|
||||
github.com/satori/go.uuid v1.2.0 // indirect
|
||||
github.com/sirupsen/logrus v1.7.0
|
||||
github.com/spf13/pflag v1.0.5
|
||||
github.com/steambap/captcha v1.3.1
|
||||
github.com/temoto/robotstxt v1.1.1 // indirect
|
||||
github.com/unrolled/logger v0.0.0-20201216141554-31a3694fe979
|
||||
github.com/vcraescu/go-paginator v1.0.0
|
||||
github.com/wblakecaldwell/profiler v0.0.0-20150908040756-6111ef1313a1
|
||||
|
||||
22
go.sum
22
go.sum
@@ -25,8 +25,8 @@ github.com/Masterminds/sprig v2.22.0+incompatible/go.mod h1:y6hNFY5UBTIWBxnzTeuN
|
||||
github.com/NYTimes/gziphandler v1.1.1 h1:ZUDjpQae29j0ryrS0u/B8HZfJBtBQHjqw2rQ2cqUQ3I=
|
||||
github.com/NYTimes/gziphandler v1.1.1/go.mod h1:n/CVRwUEOgIxrgPvAQhUUr9oeUtvrhMomdKFjzJNB0c=
|
||||
github.com/OneOfOne/xxhash v1.2.2/go.mod h1:HSdplMjZKSmBqAxg5vPj2TmRDmfkzw+cTzAElWljhcU=
|
||||
github.com/PuerkitoBio/goquery v1.5.1 h1:PSPBGne8NIUWw+/7vFBV+kG2J/5MOjbzc7154OaKCSE=
|
||||
github.com/PuerkitoBio/goquery v1.5.1/go.mod h1:GsLWisAFVj4WgDibEWF4pvYnkVQBpKBKeU+7zCJoLcc=
|
||||
github.com/PuerkitoBio/goquery v1.6.1 h1:FgjbQZKl5HTmcn4sKBgvx8vv63nhyhIpv7lJpFGCWpk=
|
||||
github.com/PuerkitoBio/goquery v1.6.1/go.mod h1:GsLWisAFVj4WgDibEWF4pvYnkVQBpKBKeU+7zCJoLcc=
|
||||
github.com/PuerkitoBio/purell v1.1.1 h1:WEQqlqaGbrPkxLJWfBwQmfEAE1Z7ONdDLqrN38tNFfI=
|
||||
github.com/PuerkitoBio/purell v1.1.1/go.mod h1:c11w/QuzBsJSee3cPx9rAFu61PvFxuPbtSwDGJws/X0=
|
||||
github.com/PuerkitoBio/urlesc v0.0.0-20170810143723-de5bf2ad4578 h1:d+Bc7a5rLufV/sSk/8dngufqelfh6jnri85riMAaF/M=
|
||||
@@ -36,16 +36,15 @@ github.com/alecthomas/template v0.0.0-20160405071501-a0175ee3bccc/go.mod h1:LOuy
|
||||
github.com/alecthomas/units v0.0.0-20151022065526-2efee857e7cf/go.mod h1:ybxpYRFXyAe+OPACYpWeL0wqObRcbAqCMya13uyzqw0=
|
||||
github.com/andreadipersio/securecookie v0.0.0-20131119095127-e3c3b33544ec h1:h8ZUCz6pj641NovNuhh/iowIh8yjwtES/Qm61C8lFuM=
|
||||
github.com/andreadipersio/securecookie v0.0.0-20131119095127-e3c3b33544ec/go.mod h1:vX8uUNqOR/LOTwsISi5thUTqArUhyOvn7Tp5/paowwA=
|
||||
github.com/andybalholm/cascadia v1.1.0 h1:BuuO6sSfQNFRu1LppgbD25Hr2vLYW25JvxHs5zzsLTo=
|
||||
github.com/andybalholm/cascadia v1.1.0/go.mod h1:GsXiBklL0woXo1j/WYWtSYYC4ouU9PqHO0sqidkEA4Y=
|
||||
github.com/andybalholm/cascadia v1.2.0 h1:vuRCkM5Ozh/BfmsaTm26kbjm0mIOM3yS5Ek/F5h18aE=
|
||||
github.com/andybalholm/cascadia v1.2.0/go.mod h1:YCyR8vOZT9aZ1CHEd8ap0gMVm2aFgxBp0T0eFw1RUQY=
|
||||
github.com/antchfx/htmlquery v1.2.3 h1:sP3NFDneHx2stfNXCKbhHFo8XgNjCACnU/4AO5gWz6M=
|
||||
github.com/antchfx/htmlquery v1.2.3/go.mod h1:B0ABL+F5irhhMWg54ymEZinzMSi0Kt3I2if0BLYa3V0=
|
||||
github.com/antchfx/xmlquery v1.2.4 h1:T/SH1bYdzdjTMoz2RgsfVKbM5uWh3gjDYYepFqQmFv4=
|
||||
github.com/antchfx/xmlquery v1.2.4/go.mod h1:KQQuESaxSlqugE2ZBcM/qn+ebIpt+d+4Xx7YcSGAIrM=
|
||||
github.com/antchfx/xmlquery v1.3.3 h1:HYmadPG0uz8CySdL68rB4DCLKXz2PurCjS3mnkVF4CQ=
|
||||
github.com/antchfx/xmlquery v1.3.3/go.mod h1:64w0Xesg2sTaawIdNqMB+7qaW/bSqkQm+ssPaCMWNnc=
|
||||
github.com/antchfx/xpath v1.1.6/go.mod h1:Yee4kTMuNiPYJ7nSNorELQMr1J33uOpXDMByNYhvtNk=
|
||||
github.com/antchfx/xpath v1.1.8 h1:PcL6bIX42Px5usSx6xRYw/wjB3wYGkj0MJ9MBzEKVgk=
|
||||
github.com/antchfx/xpath v1.1.8/go.mod h1:Yee4kTMuNiPYJ7nSNorELQMr1J33uOpXDMByNYhvtNk=
|
||||
github.com/antchfx/xpath v1.1.10 h1:cJ0pOvEdN/WvYXxvRrzQH9x5QWKpzHacYO8qzCcDYAg=
|
||||
github.com/antchfx/xpath v1.1.10/go.mod h1:Yee4kTMuNiPYJ7nSNorELQMr1J33uOpXDMByNYhvtNk=
|
||||
github.com/armon/circbuf v0.0.0-20150827004946-bbbad097214e/go.mod h1:3U/XgcO3hCbHZ8TKRvWD2dDTCfh9M9ya+I9JpbB7O8o=
|
||||
github.com/armon/consul-api v0.0.0-20180202201655-eb2c6b5be1b6/go.mod h1:grANhF5doyWs3UAsr3K4I6qtAmlQcZDesFNEHPZAzj8=
|
||||
github.com/armon/go-metrics v0.0.0-20180917152333-f0300d1749da/go.mod h1:Q73ZrmVTwzkszR9V5SSuryQ31EELlFMUz1kKyl939pY=
|
||||
@@ -109,8 +108,6 @@ github.com/goccy/go-yaml v1.8.6 h1:xOsXodQ17pkM420Ai0DROYyLbx8FAmX0KhU8MY6ZIg0=
|
||||
github.com/goccy/go-yaml v1.8.6/go.mod h1:U/jl18uSupI5rdI2jmuCswEA2htH9eXfferR3KfscvA=
|
||||
github.com/gocolly/colly v1.2.0 h1:qRz9YAn8FIH0qzgNUw+HT9UN7wm1oF9OBAilwEWpyrI=
|
||||
github.com/gocolly/colly v1.2.0/go.mod h1:Hof5T3ZswNVsOHYmba1u03W65HDWgpV5HifSuueE0EA=
|
||||
github.com/gocolly/colly/v2 v2.1.0 h1:k0DuZkDoCsx51bKpRJNEmcxcp+W5N8ziuwGaSDuFoGs=
|
||||
github.com/gocolly/colly/v2 v2.1.0/go.mod h1:I2MuhsLjQ+Ex+IzK3afNS8/1qP3AedHOusRPcRdC5o0=
|
||||
github.com/gofrs/uuid v3.3.0+incompatible/go.mod h1:b2aQJv3Z4Fp6yNu3cdSllBxTCLRxnplIgP/c0N/04lM=
|
||||
github.com/gogo/protobuf v1.1.1/go.mod h1:r8qH/GZQm5c6nD/R0oafs1akxWv10x8SbQlK7atdtwQ=
|
||||
github.com/gogo/protobuf v1.2.1/go.mod h1:hp+jE20tsWTFYpLwKvXlhS1hjn+gTNwPg2I6zVXpSg4=
|
||||
@@ -404,8 +401,8 @@ golang.org/x/net v0.0.0-20191119073136-fc4aabc6c914 h1:MlY3mEfbnWGmUi4rtHOtNnnnN
|
||||
golang.org/x/net v0.0.0-20191119073136-fc4aabc6c914/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
|
||||
golang.org/x/net v0.0.0-20200202094626-16171245cfb2/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
|
||||
golang.org/x/net v0.0.0-20200421231249-e086a090c8fd/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A=
|
||||
golang.org/x/net v0.0.0-20200602114024-627f9648deb9 h1:pNX+40auqi2JqRfOP1akLGtYcn15TUbkhwuCO3foqqM=
|
||||
golang.org/x/net v0.0.0-20200602114024-627f9648deb9/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A=
|
||||
golang.org/x/net v0.0.0-20200813134508-3edf25e44fcc h1:zK/HqS5bZxDptfPJNq8v7vJfXtkU7r9TLIoSr1bXaP4=
|
||||
golang.org/x/net v0.0.0-20200813134508-3edf25e44fcc/go.mod h1:/O7V0waA8r7cgGh81Ro3o1hOxt32SMVPicZroKQ2sZA=
|
||||
golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U=
|
||||
golang.org/x/oauth2 v0.0.0-20190226205417-e64efc72b421/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw=
|
||||
golang.org/x/oauth2 v0.0.0-20190604053449-0f29369cfe45/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw=
|
||||
@@ -477,6 +474,7 @@ google.golang.org/api v0.13.0/go.mod h1:iLdEw5Ide6rF15KTC1Kkl0iskquN2gFfn9o9XIsb
|
||||
google.golang.org/appengine v1.1.0/go.mod h1:EbEs0AVv82hx2wNQdGPgUI5lhzA/G0D9YwlJXL52JkM=
|
||||
google.golang.org/appengine v1.4.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4=
|
||||
google.golang.org/appengine v1.5.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4=
|
||||
google.golang.org/appengine v1.6.1 h1:QzqyMA1tlu6CgqCDUtU9V+ZKhLFT2dkJuANu5QaxI3I=
|
||||
google.golang.org/appengine v1.6.1/go.mod h1:i06prIuMbXzDqacNJfV5OdTW448YApPu5ww/cMBSeb0=
|
||||
google.golang.org/appengine v1.6.6 h1:lMO5rYAqUxkmaj76jAkRUvt5JZgFymx/+Q5Mzfivuhc=
|
||||
google.golang.org/appengine v1.6.6/go.mod h1:8WjMMxjGQR8xUklV/ARdw2HLXBOI7O7uCIDZVag1xfc=
|
||||
|
||||
34
internal/crawler.go
Normal file
34
internal/crawler.go
Normal file
@@ -0,0 +1,34 @@
|
||||
package internal
|
||||
|
||||
import (
|
||||
log "github.com/sirupsen/logrus"
|
||||
)
|
||||
|
||||
// Crawler is the contract the server uses to hand URLs over for crawling.
type Crawler interface {
	// Crawl submits url to the crawler and reports any error encountered
	// while doing so.
	Crawl(url string) error

	// Start begins the crawler's processing.
	Start()
}
|
||||
|
||||
// crawler is the package's Crawler implementation.
type crawler struct {
	// q carries the URLs submitted through Crawl to the goroutine
	// running loop.
	q chan string
}
|
||||
|
||||
func NewCrawler() (Crawler, error) {
|
||||
return &crawler{q: make(chan string)}, nil
|
||||
}
|
||||
|
||||
func (c *crawler) loop() {
|
||||
for {
|
||||
url := <-c.q
|
||||
log.Debugf("crawling %s", url)
|
||||
}
|
||||
}
|
||||
|
||||
func (c *crawler) Crawl(url string) error {
|
||||
c.q <- url
|
||||
return nil
|
||||
}
|
||||
|
||||
func (c *crawler) Start() {
|
||||
go c.loop()
|
||||
}
|
||||
@@ -103,7 +103,30 @@ func (s *Server) IndexHandler() httprouter.Handle {
|
||||
func (s *Server) AddHandler() httprouter.Handle {
|
||||
return func(w http.ResponseWriter, r *http.Request, _ httprouter.Params) {
|
||||
ctx := NewContext(s.config, s.db, r)
|
||||
s.render("add", w, ctx)
|
||||
|
||||
if r.Method == http.MethodGet {
|
||||
s.render("add", w, ctx)
|
||||
return
|
||||
}
|
||||
|
||||
url := NormalizeURL(strings.TrimSpace(r.FormValue("url")))
|
||||
if url == "" {
|
||||
ctx.Error = true
|
||||
ctx.Message = "Invalid URL"
|
||||
s.render("error", w, ctx)
|
||||
return
|
||||
}
|
||||
|
||||
if err := s.crawler.Crawl(url); err != nil {
|
||||
ctx.Error = true
|
||||
ctx.Message = fmt.Sprintf("Error adding URL: %s", err)
|
||||
s.render("error", w, ctx)
|
||||
return
|
||||
}
|
||||
|
||||
ctx.Error = false
|
||||
ctx.Message = "Successfully added url"
|
||||
s.render("error", w, ctx)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -8,6 +8,10 @@ import (
|
||||
log "github.com/sirupsen/logrus"
|
||||
)
|
||||
|
||||
// URLHashLength is the number of trailing characters of a URL's FastHash
// digest that URL.Hash keeps as the URL's short hash.
const URLHashLength = 15
|
||||
|
||||
// User ...
|
||||
type User struct {
|
||||
Username string
|
||||
@@ -28,6 +32,15 @@ type Token struct {
|
||||
ExpiresAt time.Time
|
||||
}
|
||||
|
||||
// URL is the record kept for a URL: the URL string itself plus two
// timestamps. It is serialized as JSON by Bytes and restored by LoadURL.
type URL struct {
	// URL is the URL string; String returns it and Hash is derived from it.
	URL string

	// CrawledAt and ExpiresAt are timestamps stored alongside the URL.
	CrawledAt, ExpiresAt time.Time

	// hash memoizes the result of Hash. It is unexported, so it is not
	// part of the JSON encoding.
	hash string
}
|
||||
|
||||
func LoadToken(data []byte) (token *Token, err error) {
|
||||
token = &Token{}
|
||||
if err := defaults.Set(token); err != nil {
|
||||
@@ -99,3 +112,48 @@ func (u *User) Bytes() ([]byte, error) {
|
||||
}
|
||||
return data, nil
|
||||
}
|
||||
|
||||
// NewURL ...
|
||||
func NewURL() *URL {
|
||||
u := &URL{}
|
||||
if err := defaults.Set(u); err != nil {
|
||||
log.WithError(err).Error("error creating new URI object")
|
||||
}
|
||||
return u
|
||||
}
|
||||
|
||||
func LoadURL(data []byte) (u *URL, err error) {
|
||||
u = &URL{}
|
||||
if err := defaults.Set(u); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if err = json.Unmarshal(data, &u); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return
|
||||
}
|
||||
|
||||
func (u *URL) Hash() string {
|
||||
if u.hash != "" {
|
||||
return u.hash
|
||||
}
|
||||
|
||||
hash := FastHash(u.String())
|
||||
u.hash = hash[len(hash)-URLHashLength:]
|
||||
|
||||
return u.hash
|
||||
}
|
||||
|
||||
func (u *URL) String() string {
|
||||
return u.URL
|
||||
}
|
||||
|
||||
func (u *URL) Bytes() ([]byte, error) {
|
||||
data, err := json.Marshal(u)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return data, nil
|
||||
}
|
||||
|
||||
@@ -41,6 +41,9 @@ type Server struct {
|
||||
router *Router
|
||||
server *http.Server
|
||||
|
||||
// Crawler
|
||||
crawler Crawler
|
||||
|
||||
// Data Store
|
||||
db Store
|
||||
|
||||
@@ -345,6 +348,12 @@ func NewServer(bind string, options ...Option) (*Server, error) {
|
||||
return nil, fmt.Errorf("error validating config: %w", err)
|
||||
}
|
||||
|
||||
crawler, err := NewCrawler()
|
||||
if err != nil {
|
||||
log.WithError(err).Error("error creating crawler")
|
||||
return nil, err
|
||||
}
|
||||
|
||||
db, err := NewStore(config.Store)
|
||||
if err != nil {
|
||||
log.WithError(err).Error("error creating store")
|
||||
@@ -406,6 +415,9 @@ func NewServer(bind string, options ...Option) (*Server, error) {
|
||||
// API
|
||||
api: api,
|
||||
|
||||
// Crawler
|
||||
crawler: crawler,
|
||||
|
||||
// Data Store
|
||||
db: db,
|
||||
|
||||
@@ -430,6 +442,9 @@ func NewServer(bind string, options ...Option) (*Server, error) {
|
||||
server.cron.Start()
|
||||
log.Info("started background jobs")
|
||||
|
||||
server.crawler.Start()
|
||||
log.Infof("started crawler")
|
||||
|
||||
server.setupMetrics()
|
||||
log.Infof("serving metrics endpoint at %s/metrics", server.config.BaseURL)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user