73 lines
1.2 KiB
Go
73 lines
1.2 KiB
Go
package internal
|
|
|
|
import (
|
|
"errors"
|
|
"fmt"
|
|
"regexp"
|
|
"time"
|
|
|
|
"git.mills.io/prologic/spyda"
|
|
"github.com/gocolly/colly/v2"
|
|
log "github.com/sirupsen/logrus"
|
|
)
|
|
|
|
// ErrInvalidURL is the sentinel error reported when a URL is empty or does
// not look like an http(s) URL.
var ErrInvalidURL = errors.New("error: invalid or empty URL")
|
|
|
|
func GetLinks(url string) (linkCh chan string, errCh chan error) {
|
|
linkCh = make(chan string)
|
|
errCh = make(chan error)
|
|
|
|
if url == "" {
|
|
errCh <- ErrInvalidURL
|
|
return
|
|
}
|
|
|
|
match, err := regexp.MatchString(`^https?://.*`, url)
|
|
if err != nil {
|
|
log.WithError(err).Error("error parsing url")
|
|
errCh <- fmt.Errorf("error parsing url %s: %w", url, err)
|
|
return
|
|
}
|
|
|
|
if !match {
|
|
errCh <- ErrInvalidURL
|
|
return
|
|
}
|
|
|
|
scrapedLinks := FindLinks(url)
|
|
|
|
for link := range scrapedLinks {
|
|
linkCh <- link
|
|
}
|
|
|
|
return
|
|
}
|
|
|
|
func FindLinks(url string) chan string {
|
|
c := colly.NewCollector(
|
|
colly.Async(),
|
|
colly.UserAgent(fmt.Sprintf("%s (+https://spyda.search)", spyda.FullVersion())),
|
|
)
|
|
|
|
// Limit the number of threads started by colly to two
|
|
c.Limit(&colly.LimitRule{
|
|
Parallelism: 2,
|
|
RandomDelay: 5 * time.Second,
|
|
})
|
|
|
|
links := make(chan string)
|
|
|
|
c.OnHTML("a[href]", func(e *colly.HTMLElement) {
|
|
link := e.Request.AbsoluteURL(e.Attr("href"))
|
|
if link != "" {
|
|
links <- link
|
|
}
|
|
})
|
|
|
|
c.Visit(url)
|
|
|
|
return links
|
|
}
|