Files
spyda/internal/links.go
2021-02-02 14:35:10 +10:00

66 lines
1.1 KiB
Go

package internal
import (
"errors"
"fmt"
"regexp"
"time"
"git.mills.io/prologic/spyda"
"github.com/gocolly/colly/v2"
log "github.com/sirupsen/logrus"
)
// ErrInvalidURL is returned by GetLinks when the supplied URL is empty or
// does not start with an http:// or https:// scheme.
var ErrInvalidURL = errors.New("error: invalid or empty URL")
// GetLinks validates url and, when it looks like an HTTP(S) address, hands it
// to FindLinks and returns the resulting channel.
//
// It returns ErrInvalidURL when url is empty or does not begin with
// "http://" or "https://". If the scheme pattern itself cannot be evaluated,
// the failure is logged and returned wrapped with the offending url.
func GetLinks(url string) (links chan string, err error) {
	if len(url) == 0 {
		return nil, ErrInvalidURL
	}

	isHTTP, matchErr := regexp.MatchString(`^https?://.*`, url)
	switch {
	case matchErr != nil:
		log.WithError(matchErr).Error("error parsing url")
		return nil, fmt.Errorf("error parsing url %s: %w", url, matchErr)
	case !isHTTP:
		return nil, ErrInvalidURL
	}

	return FindLinks(url), nil
}
// FindLinks crawls url with an asynchronous colly collector and streams every
// non-empty absolute link found in an `a[href]` element on the returned
// channel.
//
// The channel is unbuffered, so the collector's HTML callback blocks on each
// link until the caller receives it; callers should keep receiving until the
// channel is closed. The channel is closed after the collector's Wait
// returns. Failures to install the rate limit or to start the visit are
// logged rather than returned, because the signature carries no error.
func FindLinks(url string) chan string {
	c := colly.NewCollector(
		colly.Async(),
		colly.UserAgent(fmt.Sprintf("%s (+https://spyda.search)", spyda.FullVersion())),
	)

	// Limit colly to two concurrent requests with a random delay between
	// them. A LimitRule must carry a domain pattern: without one colly
	// reports ErrNoPattern and the rule never matches any request, so match
	// every domain explicitly.
	if err := c.Limit(&colly.LimitRule{
		DomainGlob:  "*",
		Parallelism: 2,
		RandomDelay: 5 * time.Second,
	}); err != nil {
		log.WithError(err).Error("error setting crawl limits")
	}

	links := make(chan string)
	c.OnHTML("a[href]", func(e *colly.HTMLElement) {
		// AbsoluteURL yields "" for hrefs it cannot resolve; skip those.
		link := e.Request.AbsoluteURL(e.Attr("href"))
		if link != "" {
			links <- link
		}
	})

	// Do not drop a failed Visit silently: log it and still fall through so
	// the channel below gets closed and the caller is not left blocked.
	if err := c.Visit(url); err != nil {
		log.WithError(err).WithField("url", url).Error("error visiting url")
	}

	// Close the channel once all outstanding collector work has finished so
	// that a range over it terminates.
	go func() {
		c.Wait()
		close(links)
	}()

	return links
}