67 lines
1.1 KiB
Go
67 lines
1.1 KiB
Go
package internal
|
|
|
|
import (
|
|
"errors"
|
|
"fmt"
|
|
"regexp"
|
|
"time"
|
|
|
|
"git.mills.io/prologic/spyda"
|
|
"github.com/gocolly/colly/v2"
|
|
log "github.com/sirupsen/logrus"
|
|
)
|
|
|
|
var (
|
|
ErrInvalidURL = errors.New("error: invalid or empty URL")
|
|
)
|
|
|
|
func GetLinks(url string) (links chan string, err error) {
|
|
if url == "" {
|
|
return nil, ErrInvalidURL
|
|
}
|
|
|
|
match, err := regexp.MatchString(`^https?://.*`, url)
|
|
if err != nil {
|
|
log.WithError(err).Error("error parsing url")
|
|
return nil, fmt.Errorf("error parsing url %s: %w", url, err)
|
|
}
|
|
|
|
if !match {
|
|
return nil, ErrInvalidURL
|
|
}
|
|
|
|
return FindLinks(url), nil
|
|
}
|
|
|
|
func FindLinks(url string) chan string {
|
|
c := colly.NewCollector(
|
|
colly.Async(),
|
|
colly.UserAgent(fmt.Sprintf("%s (+https://spyda.search)", spyda.FullVersion())),
|
|
)
|
|
|
|
// Limit the number of threads started by colly to two
|
|
c.Limit(&colly.LimitRule{
|
|
Parallelism: 2,
|
|
RandomDelay: 5 * time.Second,
|
|
})
|
|
|
|
links := make(chan string)
|
|
|
|
c.OnHTML("a[href]", func(e *colly.HTMLElement) {
|
|
link := e.Request.AbsoluteURL(e.Attr("href"))
|
|
if link != "" {
|
|
links <- link
|
|
}
|
|
})
|
|
|
|
c.Visit(url)
|
|
|
|
defer func() {
|
|
close(links)
|
|
}()
|
|
|
|
go c.Wait()
|
|
|
|
return links
|
|
}
|