Record and skip URLs the crawler has seen before

James Mills
2021-02-01 23:46:26 +10:00
parent b398a3a709
commit 8a1161cf77
7 changed files with 126 additions and 36 deletions


@@ -8,10 +8,6 @@ import (
 	log "github.com/sirupsen/logrus"
 )
 
-const (
-	URLHashLength = 15
-)
-
 // User ...
 type User struct {
 	Username string
@@ -114,8 +110,8 @@ func (u *URL) Bytes() ([]byte, error) {
 }
 
 // NewURL ...
-func NewURL() *URL {
-	u := &URL{}
+func NewURL(url string) *URL {
+	u := &URL{URL: url}
 	if err := defaults.Set(u); err != nil {
 		log.WithError(err).Error("error creating new URI object")
 	}
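
Taken together with the commit message, this hunk suggests the crawler now builds a URL record up front and keys a seen-set by its short hash. A minimal sketch of that dedup pattern, assuming NewURL and Hash behave as shown in this diff (the in-memory map is a hypothetical stand-in for whatever store the project actually persists to):

// seenBefore reports whether url has been crawled already, recording
// it if not. Sketch only: "seen" is a hypothetical in-memory stand-in
// for the crawler's real store; keys are the short URL hashes.
func seenBefore(seen map[string]bool, url string) bool {
	key := NewURL(url).Hash()
	if seen[key] {
		return true // skip: this URL was recorded on an earlier visit
	}
	seen[key] = true // first visit: record it
	return false
}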
@@ -140,8 +136,7 @@ func (u *URL) Hash() string {
 		return u.hash
 	}
 
-	hash := FastHash(u.String())
-	u.hash = hash[len(hash)-URLHashLength:]
+	u.hash = HashURL(u.String())
 
 	return u.hash
 }
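
HashURL itself isn't visible in these hunks; judging from the two lines it replaces, it presumably factors the old inline computation (and the relocated URLHashLength constant) into a reusable helper, roughly:

// HashURL returns a short, fixed-length key for a URL: the trailing
// URLHashLength characters of its FastHash digest. A sketch inferred
// from the replaced lines above, not the commit's actual definition.
func HashURL(url string) string {
	hash := FastHash(url)
	return hash[len(hash)-URLHashLength:]
}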