Add working Crawler, Scraper and Indexer

James Mills
2021-02-01 22:38:17 +10:00
parent 753aff61a1
commit b398a3a709
8 changed files with 328 additions and 21 deletions


@@ -10,17 +10,41 @@ type Crawler interface {
 }
 type crawler struct {
-	q chan string
+	conf    *Config
+	q       chan string
+	indexer Indexer
 }
-func NewCrawler() (Crawler, error) {
-	return &crawler{q: make(chan string)}, nil
+func NewCrawler(conf *Config, indexer Indexer) (Crawler, error) {
+	return &crawler{
+		conf:    conf,
+		q:       make(chan string),
+		indexer: indexer,
+	}, nil
 }
+
+func (c *crawler) loop() {
+	for {
+		url := <-c.q
+		log.Debugf("crawling %s", url)
+
+		links, err := GetLinks(url)
+		if err != nil {
+			log.WithError(err).Errorf("error crawling %s", url)
+			continue
+		}
+
+		for link := range links {
+			log.Debugf("found %s", link)
+			entry, err := Scrape(c.conf, link)
+			if err != nil {
+				log.WithError(err).Errorf("error scraping %s", link)
+			} else {
+				if err := c.indexer.Index(entry); err != nil {
+					log.WithError(err).Errorf("error indexing %s", link)
+				}
+			}
+		}
+	}
+}
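The hunk above only shows the consumer side of the queue: loop() blocks on <-c.q, fetches the links for each URL, scrapes every link and hands the result to the indexer. Nothing shown here feeds c.q or starts loop(), and the Crawler interface methods are cut off by the hunk. A minimal producer sketch, purely as an assumption about the intended shape (the Crawl name and method are not part of this diff):

func (c *crawler) Crawl(url string) {
	// hypothetical helper: enqueue a URL for loop() to pick up;
	// loop() itself would presumably be started in a goroutine elsewhere
	c.q <- url
}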

internal/entry.go (new file, 56 additions)

@@ -0,0 +1,56 @@
package internal

import (
	"encoding/json"

	"github.com/creasty/defaults"
)

// Entry ...
type Entry struct {
	URL         string
	Title       string
	Author      string
	Summary     string
	Content     string
	HTMLContent string
	Length      int

	hash string
}

func LoadEntry(data []byte) (entry *Entry, err error) {
	entry = &Entry{}
	if err := defaults.Set(entry); err != nil {
		return nil, err
	}
	if err = json.Unmarshal(data, &entry); err != nil {
		return nil, err
	}
	return
}

func (e *Entry) String() string {
	return e.URL
}

func (e *Entry) Hash() string {
	if e.hash != "" {
		return e.hash
	}
	hash := FastHash(e.String())
	e.hash = hash[len(hash)-URLHashLength:]
	return e.hash
}

func (e *Entry) Bytes() ([]byte, error) {
	data, err := json.Marshal(e)
	if err != nil {
		return nil, err
	}
	return data, nil
}
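A round-trip sketch over these helpers, in the same package (the values are made up); Hash() is derived from the URL via FastHash and truncated to URLHashLength, both defined elsewhere in the package:

func exampleEntryRoundTrip() error {
	entry := &Entry{URL: "https://example.com/post", Title: "Example"}

	data, err := entry.Bytes() // JSON-encode the entry
	if err != nil {
		return err
	}

	loaded, err := LoadEntry(data) // defaults applied, then unmarshalled
	if err != nil {
		return err
	}

	log.Debugf("loaded %s (hash %s)", loaded, loaded.Hash())
	return nil
}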

internal/indexer.go (new file, 42 additions)

@@ -0,0 +1,42 @@
package internal

import (
	"path/filepath"

	"github.com/apex/log"
	"github.com/blevesearch/bleve/v2"
)

type Indexer interface {
	Index(entry *Entry) error
}

type indexer struct {
	idx bleve.Index
}

func NewIndexer(conf *Config) (Indexer, error) {
	var (
		idx bleve.Index
		err error
	)

	fn := filepath.Join(conf.Data, "spyda.bleve")
	if FileExists(fn) {
		idx, err = bleve.Open(fn)
	} else {
		mapping := bleve.NewIndexMapping()
		idx, err = bleve.New(fn, mapping)
	}
	if err != nil {
		log.WithError(err).Error("error creating indexer")
		return nil, err
	}

	return &indexer{idx: idx}, nil
}

func (i *indexer) Index(entry *Entry) error {
	return i.idx.Index(entry.Hash(), entry)
}
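Indexer only exposes Index here; searching the bleve index is not part of this commit. A usage sketch in the same package, assuming a valid *Config (conf.Data must point at an existing data directory) and an *Entry produced by the scraper:

func exampleIndex(conf *Config, entry *Entry) error {
	indexer, err := NewIndexer(conf) // opens or creates <conf.Data>/spyda.bleve
	if err != nil {
		return err
	}
	// documents are keyed by the entry's short URL hash, so indexing the
	// same URL again replaces the previous document
	return indexer.Index(entry)
}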


@@ -15,34 +15,22 @@ var (
 	ErrInvalidURL = errors.New("error: invalid or empty URL")
 )
-func GetLinks(url string) (linkCh chan string, errCh chan error) {
-	linkCh = make(chan string)
-	errCh = make(chan error)
+func GetLinks(url string) (links chan string, err error) {
 	if url == "" {
-		errCh <- ErrInvalidURL
-		return
+		return nil, ErrInvalidURL
 	}
 	match, err := regexp.MatchString(`^https?://.*`, url)
 	if err != nil {
 		log.WithError(err).Error("error parsing url")
-		errCh <- fmt.Errorf("error parsing url %s: %w", url, err)
-		return
+		return nil, fmt.Errorf("error parsing url %s: %w", url, err)
 	}
 	if !match {
-		errCh <- ErrInvalidURL
-		return
+		return nil, ErrInvalidURL
 	}
-	scrapedLinks := FindLinks(url)
-	for link := range scrapedLinks {
-		linkCh <- link
-	}
-	return
+	return FindLinks(url), nil
 }
 func FindLinks(url string) chan string {
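With the new signature, bad input comes back synchronously as an error instead of being pushed onto an error channel, and callers simply range over the returned link channel, as the crawler loop above now does. A short caller sketch in the same package (the URL is made up, and it assumes FindLinks closes the channel when it is done):

func exampleGetLinks() {
	links, err := GetLinks("https://example.com")
	if err != nil {
		log.WithError(err).Error("error getting links")
		return
	}
	for link := range links { // ranges until FindLinks closes the channel
		log.Debugf("found %s", link)
	}
}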

internal/scraper.go (new file, 64 additions)

@@ -0,0 +1,64 @@
package internal

import (
	"fmt"
	"io/ioutil"
	"path/filepath"
	"regexp"

	"github.com/apex/log"
	readability "github.com/go-shiori/go-readability"
)

func Scrape(conf *Config, url string) (*Entry, error) {
	if url == "" {
		return nil, ErrInvalidURL
	}

	match, err := regexp.MatchString(`^https?://.*`, url)
	if err != nil {
		log.WithError(err).Error("error parsing url")
		return nil, fmt.Errorf("error parsing url %s: %w", url, err)
	}
	if !match {
		return nil, ErrInvalidURL
	}

	res, err := Request(conf, "GET", url, nil)
	if err != nil {
		log.WithError(err).Error("error fetching url")
		return nil, fmt.Errorf("error fetching url %s: %w", url, err)
	}
	defer res.Body.Close()

	article, err := readability.FromReader(res.Body, url)
	if err != nil {
		log.WithError(err).Error("error processing url")
		return nil, fmt.Errorf("error processing url %s: %w", url, err)
	}

	entry := &Entry{
		URL:         url,
		Title:       article.Title,
		Author:      article.Byline,
		Length:      article.Length,
		Summary:     article.Excerpt,
		Content:     article.TextContent,
		HTMLContent: article.Content,
	}

	fn := filepath.Join(conf.Data, fmt.Sprintf("%s.json", entry.Hash()))

	data, err := entry.Bytes()
	if err != nil {
		log.WithError(err).Error("error serializing entry")
		return nil, fmt.Errorf("error serializing entry: %s", err)
	}

	if err := ioutil.WriteFile(fn, data, 0644); err != nil {
		log.WithError(err).Error("error persisting entry")
		return nil, fmt.Errorf("error persisting entry: %w", err)
	}

	return entry, nil
}
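A caller sketch in the same package, assuming a valid *Config whose Data directory exists (the URL is made up). Scrape both returns the extracted Entry and persists it to <hash>.json under conf.Data:

func exampleScrape(conf *Config) {
	entry, err := Scrape(conf, "https://example.com/article")
	if err != nil {
		log.WithError(err).Error("error scraping url")
		return
	}
	log.Debugf("scraped %q (length %d), saved as %s.json", entry.Title, entry.Length, entry.Hash())
}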


@@ -41,6 +41,9 @@ type Server struct {
 	router *Router
 	server *http.Server
+	// Indexer
+	indexer Indexer
+
 	// Crawler
 	crawler Crawler
@@ -348,7 +351,13 @@ func NewServer(bind string, options ...Option) (*Server, error) {
 		return nil, fmt.Errorf("error validating config: %w", err)
 	}
-	crawler, err := NewCrawler()
+	indexer, err := NewIndexer(config)
+	if err != nil {
+		log.WithError(err).Error("error creating indexer")
+		return nil, err
+	}
+
+	crawler, err := NewCrawler(config, indexer)
 	if err != nil {
 		log.WithError(err).Error("error creating crawler")
 		return nil, err
@@ -415,6 +424,9 @@ func NewServer(bind string, options ...Option) (*Server, error) {
 		// API
 		api: api,
+		// Indexer
+		indexer: indexer,
 		// Crawler
 		crawler: crawler,