package internal

import (
	"errors"
	"fmt"
	"regexp"
	"time"

	"git.mills.io/prologic/spyda"
	"github.com/gocolly/colly/v2"
	log "github.com/sirupsen/logrus"
)

var (
	// ErrInvalidURL is returned when the given URL is empty or is not an
	// absolute http(s) URL.
	ErrInvalidURL = errors.New("error: invalid or empty URL")
)

// GetLinks validates url and, if it is a well-formed http(s) URL, returns a
// channel of links found by crawling it.
func GetLinks(url string) (links chan string, err error) {
	if url == "" {
		return nil, ErrInvalidURL
	}

	match, err := regexp.MatchString(`^https?://.*`, url)
	if err != nil {
		log.WithError(err).Error("error parsing url")
		return nil, fmt.Errorf("error parsing url %s: %w", url, err)
	}
	if !match {
		return nil, ErrInvalidURL
	}

	return FindLinks(url), nil
}

// FindLinks crawls url and streams every absolute link found on the page over
// the returned channel. The channel is closed once the crawl completes.
func FindLinks(url string) chan string {
	c := colly.NewCollector(
		colly.Async(),
		colly.UserAgent(fmt.Sprintf("%s (+https://spyda.search)", spyda.FullVersion())),
	)

	// Limit colly to two concurrent requests with a random delay between them.
	// A LimitRule must specify a domain pattern to take effect, so match all
	// domains and log the error rather than silently discarding it.
	if err := c.Limit(&colly.LimitRule{
		DomainGlob:  "*",
		Parallelism: 2,
		RandomDelay: 5 * time.Second,
	}); err != nil {
		log.WithError(err).Warn("error setting collector limit rule")
	}

	links := make(chan string)

	c.OnHTML("a[href]", func(e *colly.HTMLElement) {
		link := e.Request.AbsoluteURL(e.Attr("href"))
		if link != "" {
			links <- link
		}
	})

	go func() {
		// The starting URL itself counts as a link.
		links <- url
		if err := c.Visit(url); err != nil {
			log.WithError(err).Error("error visiting url")
		}
		c.Wait()
		close(links)
	}()

	return links
}
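
// Usage sketch (not part of the original file): a caller elsewhere in this
// module could consume the channel returned by GetLinks as shown below. The
// URL is illustrative only; logging and printing choices are assumptions.
//
//	links, err := internal.GetLinks("https://example.com")
//	if err != nil {
//		log.WithError(err).Fatal("invalid url")
//	}
//	for link := range links {
//		fmt.Println(link)
//	}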