package main

import (
	"bytes"
	"fmt"
	"io/ioutil"
	"log"
	"net/http"
	"net/url"
	"os"
	"path"
	"strings"
	"time"

	"github.com/gocolly/colly"
	"github.com/spf13/viper"
	"github.com/studio-b12/gowebdav"
)
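
// Scrapper fetches a page with colly, extracts asset links, and mirrors
// them to a WebDAV share.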
type Scrapper struct {
	url             string
	c               *colly.Collector
	dav             *gowebdav.Client
	client          *http.Client
	davFolder       string
	davFolderFormat string
}
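
// ScrapperConfig carries the settings needed to build a Scrapper; see
// initConfig for the matching configuration keys.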
type ScrapperConfig struct {
	ScrapperUrl     string
	DavUrl          string
	DavUsername     string
	DavPassword     string
	DavFolder       string
	DavFolderFormat string
	HttpTimeout     time.Duration
}
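
// NewScrapper builds a Scrapper from config and checks that the WebDAV
// endpoint is reachable before returning it.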
func NewScrapper(config ScrapperConfig) (*Scrapper, error) {
	dav := gowebdav.NewClient(config.DavUrl, config.DavUsername, config.DavPassword)
	scrapper := Scrapper{
		url:             config.ScrapperUrl,
		c:               colly.NewCollector(),
		dav:             dav,
		davFolder:       config.DavFolder,
		davFolderFormat: config.DavFolderFormat,
		client: &http.Client{
			Timeout: config.HttpTimeout,
		},
	}
	err := scrapper.dav.Connect()
	if err != nil {
		return nil, err
	}
	return &scrapper, nil
}
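
// Scrape visits the configured page and returns the hrefs of links hosted
// on assets.ctfassets.net whose text contains "recette".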
func (s *Scrapper) Scrape() []string {
	// Named urls (not url) to avoid shadowing the net/url package.
	var urls []string
	s.c.OnHTML("div[data-zest] a[href]", func(a *colly.HTMLElement) {
		href := a.Attr("href")
		content := strings.ToLower(a.Text)
		if !strings.HasPrefix(href, "//assets.ctfassets.net/") || !strings.Contains(content, "recette") {
			return
		}
		urls = append(urls, href)
	})
	if err := s.c.Visit(s.url); err != nil {
		log.Printf("Cannot visit %s: %s", s.url, err)
	}
	return urls
}
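
// Download fetches u and stores it on the WebDAV share under a dated
// subfolder (davFolderFormat is a Go reference-time layout, e.g. "2006-01"),
// skipping files that are already present.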
func (s *Scrapper) Download(u, filename string) error {
	davFolder := path.Join(s.davFolder, time.Now().Format(s.davFolderFormat))
	err := s.dav.MkdirAll(davFolder, 0755)
	if err != nil {
		return err
	}
	davFilePath := path.Join(davFolder, filename)
	_, err = s.dav.Stat(davFilePath)
	if err == nil {
		log.Printf("File %s already downloaded!", filename)
		return nil
	}
	resp, err := s.client.Get(u)
	if err != nil {
		return err
	}
	defer resp.Body.Close()
	if resp.StatusCode != http.StatusOK {
		return fmt.Errorf("wrong status code: %d", resp.StatusCode)
	}
	// HEADS UP!
	//
	// Because of a potential bug with the default Nextcloud configuration,
	// the whole file is loaded in memory before being sent over the network.
	//
	// Long explanation:
	//
	// The Go net/http library behaves differently depending on the
	// implementation behind the io.Reader interface:
	//
	//   * bytes.Reader, strings.Reader and bytes.Buffer: Content-Length is
	//     set to the size of the content.
	//
	//   * others: no Content-Length is set and therefore chunked encoding
	//     is used.
	//
	// It looks like the default Nginx configuration for Nextcloud does not
	// like chunked encoding...
	//
	// See https://github.com/photoprism/photoprism/issues/443#issuecomment-685608490
	// and https://github.com/studio-b12/gowebdav/issues/35
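	//
	// Workaround: read the body fully, then hand WriteStream a bytes.Reader.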
	content, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		return err
	}
	reader := bytes.NewReader(content)
	err = s.dav.WriteStream(davFilePath, reader, 0644)
	if err != nil {
		return err
	}
	log.Printf("Downloaded %s", filename)
	return nil
}
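
// initConfig loads the YAML configuration file passed as the only
// command-line argument and checks the required keys. A hypothetical
// config.yaml could look like this (all values are illustrative):
//
//	Scrapper:
//	  URL: https://www.example.com/
//	  Timeout: 60s
//	WebDAV:
//	  URL: https://cloud.example.com/remote.php/dav/files/alice/
//	  Username: alice
//	  Password: secret
//	  Folder: recettes
//	  FolderFormat: 2006-01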
func initConfig() {
	if len(os.Args) != 2 {
		fmt.Printf("Usage: %s config.yaml\n", os.Args[0])
		os.Exit(1)
	}
	fd, err := os.Open(os.Args[1])
	if err != nil {
		fmt.Printf("open: %s: %s\n", os.Args[1], err)
		os.Exit(1)
	}
	defer fd.Close()
	viper.SetConfigType("yaml")
	err = viper.ReadConfig(fd)
	if err != nil {
		fmt.Println(err)
		os.Exit(1)
	}
	for _, config := range []string{"Scrapper.URL", "WebDAV.URL", "WebDAV.Username", "WebDAV.Password", "WebDAV.Folder", "WebDAV.FolderFormat"} {
		if viper.GetString(config) == "" {
			fmt.Printf("key %s is missing from configuration file\n", config)
			os.Exit(1)
		}
	}
	viper.SetDefault("Scrapper.Timeout", 60*time.Second)
}
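
// main scrapes the configured page and downloads every matching asset,
// exiting non-zero when a download fails or no URL was found.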
func main() {
	initConfig()
	scrapper, err := NewScrapper(ScrapperConfig{
		ScrapperUrl:     viper.GetString("Scrapper.URL"),
		DavUrl:          viper.GetString("WebDAV.URL"),
		DavUsername:     viper.GetString("WebDAV.Username"),
		DavPassword:     viper.GetString("WebDAV.Password"),
		DavFolder:       viper.GetString("WebDAV.Folder"),
		DavFolderFormat: viper.GetString("WebDAV.FolderFormat"),
		HttpTimeout:     viper.GetDuration("Scrapper.Timeout"),
	})
	if err != nil {
		log.Fatal(err)
	}
	urls := scrapper.Scrape()
	fail := false
	atLeastOne := false
	for _, u := range urls {
		parts, err := url.Parse(u)
		if err != nil {
			log.Printf("Cannot parse URL '%s': %s", u, err)
			continue
		}
		parts.Scheme = "https" // hrefs are protocol-relative ("//..."), so the scheme is missing
		filename := path.Base(parts.Path)
		err = scrapper.Download(parts.String(), filename)
		if err != nil {
			fail = true
			log.Printf("Cannot download file '%s': %s", filename, err)
		}
		atLeastOne = true
	}
	if fail || !atLeastOne {
		os.Exit(1)
	}
	os.Exit(0)
}