Add working Crawler, Scraper and Indexer
This commit is contained in:
64
internal/scraper.go
Normal file
64
internal/scraper.go
Normal file
@@ -0,0 +1,64 @@
|
||||
package internal
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"io/ioutil"
|
||||
"path/filepath"
|
||||
"regexp"
|
||||
|
||||
"github.com/apex/log"
|
||||
readability "github.com/go-shiori/go-readability"
|
||||
)
|
||||
|
||||
func Scrape(conf *Config, url string) (*Entry, error) {
|
||||
if url == "" {
|
||||
return nil, ErrInvalidURL
|
||||
}
|
||||
|
||||
match, err := regexp.MatchString(`^https?://.*`, url)
|
||||
if err != nil {
|
||||
log.WithError(err).Error("error parsing url")
|
||||
return nil, fmt.Errorf("error parsing url %s: %w", url, err)
|
||||
}
|
||||
|
||||
if !match {
|
||||
return nil, ErrInvalidURL
|
||||
}
|
||||
|
||||
res, err := Request(conf, "GET", url, nil)
|
||||
if err != nil {
|
||||
log.WithError(err).Error("error fetching url")
|
||||
return nil, fmt.Errorf("error fetching url %s: %w", url, err)
|
||||
}
|
||||
defer res.Body.Close()
|
||||
|
||||
article, err := readability.FromReader(res.Body, url)
|
||||
if err != nil {
|
||||
log.WithError(err).Error("error processing url")
|
||||
return nil, fmt.Errorf("error processing url %s: %w", url, err)
|
||||
}
|
||||
|
||||
entry := &Entry{
|
||||
URL: url,
|
||||
Title: article.Title,
|
||||
Author: article.Byline,
|
||||
Length: article.Length,
|
||||
Summary: article.Excerpt,
|
||||
Content: article.TextContent,
|
||||
HTMLContent: article.Content,
|
||||
}
|
||||
|
||||
fn := filepath.Join(conf.Data, fmt.Sprintf("%s.json", entry.Hash()))
|
||||
data, err := entry.Bytes()
|
||||
if err != nil {
|
||||
log.WithError(err).Error("error serializing entry")
|
||||
return nil, fmt.Errorf("error serializing entry: %s", err)
|
||||
}
|
||||
|
||||
if err := ioutil.WriteFile(fn, data, 0644); err != nil {
|
||||
log.WithError(err).Error("error persisting entry")
|
||||
return nil, fmt.Errorf("error persisting entry: %w", err)
|
||||
}
|
||||
|
||||
return entry, nil
|
||||
}
|
||||
Reference in New Issue
Block a user