package internal

import (
	"errors"
	"fmt"
	"regexp"
	"time"

	"git.mills.io/prologic/spyda"
	"github.com/gocolly/colly/v2"
	log "github.com/sirupsen/logrus"
)

var (
	// ErrInvalidURL is returned when the given URL is empty or not an
	// http(s) URL.
	ErrInvalidURL = errors.New("invalid or empty URL")
)

// GetLinks validates url and returns a channel of links scraped from the
// page along with a channel of errors. Validation and scraping run in a
// separate goroutine so this function returns immediately and sends on the
// unbuffered link channel cannot deadlock; both channels are closed once
// the work is done.
func GetLinks(url string) (linkCh chan string, errCh chan error) {
	linkCh = make(chan string)
	// Buffer the error channel so the single error send never blocks,
	// even if the caller only drains the link channel.
	errCh = make(chan error, 1)

	go func() {
		defer close(errCh)
		defer close(linkCh)

		if url == "" {
			errCh <- ErrInvalidURL
			return
		}

		match, err := regexp.MatchString(`^https?://.*`, url)
		if err != nil {
			log.WithError(err).Error("error parsing url")
			errCh <- fmt.Errorf("error parsing url %s: %w", url, err)
			return
		}
		if !match {
			errCh <- ErrInvalidURL
			return
		}

		for link := range FindLinks(url) {
			linkCh <- link
		}
	}()

	return
}

// FindLinks crawls url and returns a channel yielding the absolute URL of
// every link found on the page. The channel is closed once the crawl
// completes.
func FindLinks(url string) chan string {
	c := colly.NewCollector(
		colly.Async(),
		colly.UserAgent(fmt.Sprintf("%s (+https://spyda.search)", spyda.FullVersion())),
	)

	// Limit colly to two concurrent requests with a random delay of up to
	// five seconds. A LimitRule needs a domain pattern to apply, and
	// Limit reports an error if the rule is invalid.
	if err := c.Limit(&colly.LimitRule{
		DomainGlob:  "*",
		Parallelism: 2,
		RandomDelay: 5 * time.Second,
	}); err != nil {
		log.WithError(err).Error("error setting limit rule")
	}

	links := make(chan string)

	c.OnHTML("a[href]", func(e *colly.HTMLElement) {
		link := e.Request.AbsoluteURL(e.Attr("href"))
		if link != "" {
			links <- link
		}
	})

	go func() {
		// The collector is asynchronous, so Visit returns as soon as
		// the request is queued; Wait blocks until all requests (and
		// their OnHTML callbacks) have finished, after which it is
		// safe to close the channel.
		defer close(links)
		if err := c.Visit(url); err != nil {
			log.WithError(err).Error("error visiting url")
		}
		c.Wait()
	}()

	return links
}
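
// Example usage: a minimal sketch of how a caller might drain both
// channels concurrently. The URL, the surrounding program, and the
// fmt/logrus handling below are illustrative assumptions, not part of
// this package:
//
//	links, errs := GetLinks("https://example.com")
//	for links != nil || errs != nil {
//		select {
//		case link, ok := <-links:
//			if !ok {
//				links = nil // channel closed; stop selecting on it
//				continue
//			}
//			fmt.Println(link)
//		case err, ok := <-errs:
//			if !ok {
//				errs = nil // channel closed; stop selecting on it
//				continue
//			}
//			log.WithError(err).Error("error getting links")
//		}
//	}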