Add link crawler
72
internal/links.go
Normal file
@@ -0,0 +1,72 @@
package internal

import (
	"errors"
	"fmt"
	"regexp"
	"time"

	"git.mills.io/prologic/spyda"
	"github.com/gocolly/colly/v2"
	log "github.com/sirupsen/logrus"
)

var (
	ErrInvalidURL = errors.New("error: invalid or empty URL")
)

// GetLinks validates url and streams the links found on that page over
// linkCh; validation and scraping errors are reported on errCh. The work
// runs in a goroutine so the unbuffered channels can be returned to the
// caller before anything is sent on them, and linkCh is closed when the
// crawl finishes.
func GetLinks(url string) (linkCh chan string, errCh chan error) {
	linkCh = make(chan string)
	errCh = make(chan error)

	go func() {
		defer close(linkCh)

		if url == "" {
			errCh <- ErrInvalidURL
			return
		}

		match, err := regexp.MatchString(`^https?://.*`, url)
		if err != nil {
			log.WithError(err).Error("error parsing url")
			errCh <- fmt.Errorf("error parsing url %s: %w", url, err)
			return
		}

		if !match {
			errCh <- ErrInvalidURL
			return
		}

		scrapedLinks := FindLinks(url)

		for link := range scrapedLinks {
			linkCh <- link
		}
	}()

	return
}

// FindLinks crawls url with an asynchronous colly collector and returns a
// channel of absolute URLs taken from a[href] elements. The channel is
// closed once the collector has finished all pending requests.
func FindLinks(url string) chan string {
	c := colly.NewCollector(
		colly.Async(),
		colly.UserAgent(fmt.Sprintf("%s (+https://spyda.search)", spyda.FullVersion())),
	)

	// Limit the number of threads started by colly to two
	c.Limit(&colly.LimitRule{
		// A domain pattern is required; without one colly rejects the rule.
		DomainGlob:  "*",
		Parallelism: 2,
		RandomDelay: 5 * time.Second,
	})

	links := make(chan string)

	c.OnHTML("a[href]", func(e *colly.HTMLElement) {
		link := e.Request.AbsoluteURL(e.Attr("href"))
		if link != "" {
			links <- link
		}
	})

	go func() {
		defer close(links)
		if err := c.Visit(url); err != nil {
			log.WithError(err).Error("error visiting url")
			return
		}
		// Wait for the asynchronous collector to drain its request queue.
		c.Wait()
	}()

	return links
}
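A minimal sketch of how a caller might consume these channels, assuming it lives inside the same module (Go internal packages are not importable from outside it; the import path below is inferred from the module path used in the file's own imports, and the URL is illustrative):

package main

import (
	"fmt"

	"git.mills.io/prologic/spyda/internal"
)

func main() {
	linkCh, errCh := internal.GetLinks("https://example.com")

	for {
		select {
		case link, ok := <-linkCh:
			if !ok {
				return // linkCh closed: crawl finished
			}
			fmt.Println(link)
		case err := <-errCh:
			fmt.Println("error:", err)
			return
		}
	}
}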