package internal

import (
	"fmt"
	"io/ioutil"
	"os"
	"path/filepath"
	"regexp"

	"github.com/apex/log"
	readability "github.com/go-shiori/go-readability"
)

const (
	// cacheDir is the name of the directory, created under conf.Data, in
	// which Scrape persists the entries it produces.
	cacheDir = "cache"
)

// httpURLPattern matches strings that begin with "http://" or "https://".
// It is compiled once at package initialization rather than on every call
// to Scrape.
var httpURLPattern = regexp.MustCompile(`^https?://.*`)

// Scrape fetches url, extracts its readable article content and persists the
// resulting entry to disk.
//
// The url must be non-empty and start with "http://" or "https://";
// otherwise ErrInvalidURL is returned. The page is retrieved with a GET
// issued through Request and parsed with go-readability. The resulting Entry
// is serialized with entry.Bytes() and written to
// <conf.Data>/cache/<entry.Hash()>.json, creating the cache directory if it
// does not exist yet.
//
// Every failure after URL validation is logged and returned wrapped with
// %w, so callers can inspect the underlying cause with errors.Is/errors.As.
// On error the returned *Entry is always nil.
func Scrape(conf *Config, url string) (*Entry, error) {
	if url == "" {
		return nil, ErrInvalidURL
	}

	if !httpURLPattern.MatchString(url) {
		return nil, ErrInvalidURL
	}

	res, err := Request(conf, "GET", url, nil)
	if err != nil {
		log.WithError(err).Error("error fetching url")
		return nil, fmt.Errorf("error fetching url %s: %w", url, err)
	}
	defer res.Body.Close()

	article, err := readability.FromReader(res.Body, url)
	if err != nil {
		log.WithError(err).Error("error processing url")
		return nil, fmt.Errorf("error processing url %s: %w", url, err)
	}

	entry := &Entry{
		URL:         url,
		Title:       article.Title,
		Author:      article.Byline,
		Length:      article.Length,
		Summary:     article.Excerpt,
		Content:     article.TextContent,
		HTMLContent: article.Content,
	}

	// Make sure the cache directory exists before writing into it.
	p := filepath.Join(conf.Data, cacheDir)
	if err := os.MkdirAll(p, 0755); err != nil {
		log.WithError(err).Error("error creating cache directory")
		return nil, fmt.Errorf("error creating cache directory: %w", err)
	}

	// The cache file is named after the entry's hash.
	fn := filepath.Join(p, fmt.Sprintf("%s.json", entry.Hash()))

	data, err := entry.Bytes()
	if err != nil {
		log.WithError(err).Error("error serializing entry")
		// Wrap with %w (not %s) so the cause stays in the error chain,
		// consistent with every other error path in this function.
		return nil, fmt.Errorf("error serializing entry: %w", err)
	}

	if err := ioutil.WriteFile(fn, data, 0644); err != nil {
		log.WithError(err).Error("error persisting entry")
		return nil, fmt.Errorf("error persisting entry: %w", err)
	}

	return entry, nil
}