mirror of https://github.com/nmasse-itix/evdb.git
commit
579886d6a4
4 changed files with 267 additions and 0 deletions
@ -0,0 +1,20 @@ |
|||
package main |
|||
|
|||
import ( |
|||
"fmt" |
|||
|
|||
"github.com/nmasse-itix/evdb/ficheauto" |
|||
) |
|||
|
|||
func main() { |
|||
scrapper := ficheauto.NewScrapper("https://www.fiches-auto.fr") |
|||
cars := scrapper.Scrape() |
|||
for _, car := range cars { |
|||
fmt.Println(car) |
|||
} |
|||
// scrapper := ademe.NewScrapper("https://carlabelling.ademe.fr")
|
|||
// cars := scrapper.Scrape()
|
|||
// for _, car := range cars {
|
|||
// fmt.Println(car)
|
|||
// }
|
|||
} |
|||
@ -0,0 +1,174 @@ |
|||
package ficheauto |
|||
|
|||
import ( |
|||
"fmt" |
|||
"regexp" |
|||
"strconv" |
|||
"strings" |
|||
|
|||
"github.com/gocolly/colly" |
|||
) |
|||
|
|||
// Car holds the characteristics scraped for a single vehicle.
// Numeric fields that could not be parsed keep their zero value.
type Car struct {
	// Brand is the brand name.
	Brand string
	// Model is the model name including its variant.
	Model string
	// Weight is expressed in kg.
	Weight int
	// Layout tells whether the motor drives the front wheels, the rear
	// wheels or both.
	Layout string
	// Power is expressed in ch.
	Power int
	// Acceleration is the time needed to go from 0 to 100 km/h, in seconds.
	Acceleration float32
	// MaxSpeed is expressed in km/h.
	MaxSpeed int
	// TrunkSpace is the space available in the trunk, in dm3.
	TrunkSpace int
	// BatteryCapacity is expressed in kWh.
	BatteryCapacity float32
	// MaxChargingPower is the maximum charging power, in kW.
	MaxChargingPower int
	// RetailPrice is expressed in k€.
	RetailPrice int
}
|||
|
|||
func (car Car) String() string { |
|||
return fmt.Sprintf("%s %s (%d ch, %.1f kWh)", car.Brand, car.Model, car.Power, car.BatteryCapacity) |
|||
} |
|||
|
|||
type Scrapper struct { |
|||
url string |
|||
c *colly.Collector |
|||
} |
|||
|
|||
func NewScrapper(baseUrl string) *Scrapper { |
|||
scrapper := Scrapper{ |
|||
url: fmt.Sprintf("%s/articles-auto/electrique/s-852-comparatif-des-voitures-electriques.php", baseUrl), |
|||
c: colly.NewCollector(), |
|||
} |
|||
return &scrapper |
|||
} |
|||
|
|||
func (s *Scrapper) Scrape() []Car { |
|||
var cars []Car |
|||
|
|||
s.c.OnHTML("table", func(table *colly.HTMLElement) { |
|||
if table.Attr("bordercolor") != "#C0C0C0" { |
|||
return |
|||
} |
|||
table.ForEach("tbody", func(_ int, tbody *colly.HTMLElement) { |
|||
var cols []string = []string{} |
|||
tbody.ForEach("tr", func(i int, tr *colly.HTMLElement) { |
|||
var vals []string = []string{} |
|||
tr.ForEach("td", func(j int, td *colly.HTMLElement) { |
|||
val := strings.TrimSpace(td.Text) |
|||
//fmt.Println(val)
|
|||
if i == 0 { |
|||
cols = append(cols, val) |
|||
} else { |
|||
vals = append(vals, val) |
|||
} |
|||
}) |
|||
if i > 0 { |
|||
data, err := slice2map(cols, vals) |
|||
if err != nil { |
|||
fmt.Println(err) |
|||
return |
|||
} |
|||
car, err := map2car(data) |
|||
if err != nil { |
|||
fmt.Println(err) |
|||
return |
|||
} |
|||
cars = append(cars, car) |
|||
} |
|||
}) |
|||
}) |
|||
}) |
|||
|
|||
s.c.Visit(s.url) |
|||
|
|||
return cars |
|||
} |
|||
|
|||
var (
	// nameRegexp splits a model cell of the form
	// "<brand> <model> (<weight> kg)" into brand, model and weight.
	// The brand is the first run of non-blank characters, so brands that
	// contain accents or hyphens (e.g. "Citroën", "Mercedes-Benz") are
	// accepted as well as plain alphanumeric ones.
	nameRegexp = regexp.MustCompile(`^(\S+) +(.*) +\(([0-9]+) ?kg\)`)

	// miscRegexp captures the run of digits and dots found at the very
	// beginning of a cell; whatever follows (units, notes) is ignored.
	miscRegexp = regexp.MustCompile(`^([.0-9]+)`)
)
|||
|
|||
func getInt(s string) (int, error) { |
|||
matches := miscRegexp.FindStringSubmatch(s) |
|||
if len(matches) != 2 { |
|||
return 0, fmt.Errorf("wrong number of matches: '%s'", s) |
|||
} |
|||
i, err := strconv.ParseInt(matches[1], 10, 32) |
|||
if err != nil { |
|||
return 0, err |
|||
} |
|||
return int(i), nil |
|||
} |
|||
|
|||
func getFloat(s string) (float32, error) { |
|||
matches := miscRegexp.FindStringSubmatch(s) |
|||
if len(matches) != 2 { |
|||
return 0, fmt.Errorf("wrong number of matches: '%s'", s) |
|||
} |
|||
f, err := strconv.ParseFloat(matches[1], 32) |
|||
if err != nil { |
|||
return 0, err |
|||
} |
|||
return float32(f), nil |
|||
} |
|||
|
|||
func map2car(data map[string]string) (Car, error) { |
|||
var car Car |
|||
matches := nameRegexp.FindStringSubmatch(data["Modèles"]) |
|||
if len(matches) != 4 { |
|||
return Car{}, fmt.Errorf("wrong number of matches: '%s'", data["Modèles"]) |
|||
} |
|||
car.Brand = strings.TrimSpace(matches[1]) |
|||
car.Model = strings.TrimSpace(matches[2]) |
|||
w, err := strconv.ParseInt(matches[3], 10, 32) |
|||
if err != nil { |
|||
return Car{}, err |
|||
} |
|||
car.Weight = int(w) |
|||
car.Layout = data["Motr."] |
|||
car.Acceleration, err = getFloat(data["0/100sec."]) |
|||
if err != nil { |
|||
fmt.Println(err) |
|||
} |
|||
car.BatteryCapacity, err = getFloat(data["Bat.kWh"]) |
|||
if err != nil { |
|||
fmt.Println(err) |
|||
} |
|||
car.MaxChargingPower, err = getInt(data["Puiss.ChargeMAX"]) |
|||
if err != nil { |
|||
fmt.Println(err) |
|||
} |
|||
car.MaxSpeed, err = getInt(data["Vmaxkm/h"]) |
|||
if err != nil { |
|||
fmt.Println(err) |
|||
} |
|||
car.Power, err = getInt(data["Puiss.ch"]) |
|||
if err != nil { |
|||
fmt.Println(err) |
|||
} |
|||
car.RetailPrice, err = getInt(data["Prix"]) |
|||
if err != nil { |
|||
fmt.Println(err) |
|||
} |
|||
car.TrunkSpace, err = getInt(data["CoffreLitres"]) |
|||
if err != nil { |
|||
fmt.Println(err) |
|||
} |
|||
//fmt.Println(data)
|
|||
|
|||
return car, nil |
|||
} |
|||
|
|||
// slice2map pairs each column name in cols with the value at the same index
// in vals and returns the result as a map.
//
// Both slices must have the same length; otherwise a nil map and an error
// stating both lengths are returned. When a column name appears more than
// once, the value of its last occurrence wins.
func slice2map(cols, vals []string) (map[string]string, error) {
	if len(cols) != len(vals) {
		return nil, fmt.Errorf("length mismatch: %d columns, %d values", len(cols), len(vals))
	}

	ret := make(map[string]string, len(cols))
	for i, col := range cols {
		ret[col] = vals[i]
	}

	return ret, nil
}
|||
@ -0,0 +1,16 @@ |
|||
module github.com/nmasse-itix/evdb |
|||
|
|||
go 1.16 |
|||
|
|||
require ( |
|||
github.com/PuerkitoBio/goquery v1.8.0 // indirect |
|||
github.com/antchfx/htmlquery v1.2.4 // indirect |
|||
github.com/antchfx/xmlquery v1.3.9 // indirect |
|||
github.com/gobwas/glob v0.2.3 // indirect |
|||
github.com/gocolly/colly v1.2.0 |
|||
github.com/kennygrant/sanitize v1.2.4 // indirect |
|||
github.com/saintfish/chardet v0.0.0-20120816061221-3af4cd4741ca // indirect |
|||
github.com/temoto/robotstxt v1.1.2 // indirect |
|||
golang.org/x/net v0.0.0-20220225172249-27dd8689420f // indirect |
|||
google.golang.org/appengine v1.6.7 // indirect |
|||
) |
|||
@ -0,0 +1,57 @@ |
|||
github.com/PuerkitoBio/goquery v1.8.0 h1:PJTF7AmFCFKk1N6V6jmKfrNH9tV5pNE6lZMkG0gta/U= |
|||
github.com/PuerkitoBio/goquery v1.8.0/go.mod h1:ypIiRMtY7COPGk+I/YbZLbxsxn9g5ejnI2HSMtkjZvI= |
|||
github.com/andybalholm/cascadia v1.3.1 h1:nhxRkql1kdYCc8Snf7D5/D3spOX+dBgjA6u8x004T2c= |
|||
github.com/andybalholm/cascadia v1.3.1/go.mod h1:R4bJ1UQfqADjvDa4P6HZHLh/3OxWWEqc0Sk8XGwHqvA= |
|||
github.com/antchfx/htmlquery v1.2.4 h1:qLteofCMe/KGovBI6SQgmou2QNyedFUW+pE+BpeZ494= |
|||
github.com/antchfx/htmlquery v1.2.4/go.mod h1:2xO6iu3EVWs7R2JYqBbp8YzG50gj/ofqs5/0VZoDZLc= |
|||
github.com/antchfx/xmlquery v1.3.9 h1:Y+zyMdiUZ4fasTQTkDb3DflOXP7+obcYEh80SISBmnQ= |
|||
github.com/antchfx/xmlquery v1.3.9/go.mod h1:wojC/BxjEkjJt6dPiAqUzoXO5nIMWtxHS8PD8TmN4ks= |
|||
github.com/antchfx/xpath v1.2.0 h1:mbwv7co+x0RwgeGAOHdrKy89GvHaGvxxBtPK0uF9Zr8= |
|||
github.com/antchfx/xpath v1.2.0/go.mod h1:i54GszH55fYfBmoZXapTHN8T8tkcHfRgLyVwwqzXNcs= |
|||
github.com/davecgh/go-spew v1.1.0 h1:ZDRjVQ15GmhC3fiQ8ni8+OwkZQO4DARzQgrnXU1Liz8= |
|||
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= |
|||
github.com/gobwas/glob v0.2.3 h1:A4xDbljILXROh+kObIiy5kIaPYD8e96x1tgBhUI5J+Y= |
|||
github.com/gobwas/glob v0.2.3/go.mod h1:d3Ez4x06l9bZtSvzIay5+Yzi0fmZzPgnTbPcKjJAkT8= |
|||
github.com/gocolly/colly v1.2.0 h1:qRz9YAn8FIH0qzgNUw+HT9UN7wm1oF9OBAilwEWpyrI= |
|||
github.com/gocolly/colly v1.2.0/go.mod h1:Hof5T3ZswNVsOHYmba1u03W65HDWgpV5HifSuueE0EA= |
|||
github.com/golang/groupcache v0.0.0-20200121045136-8c9f03a8e57e h1:1r7pUrabqp18hOBcwBwiTsbnFeTZHV9eER/QT5JVZxY= |
|||
github.com/golang/groupcache v0.0.0-20200121045136-8c9f03a8e57e/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= |
|||
github.com/golang/protobuf v1.3.1 h1:YF8+flBXS5eO826T4nzqPrxfhQThhXl0YzfuUPu4SBg= |
|||
github.com/golang/protobuf v1.3.1/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= |
|||
github.com/kennygrant/sanitize v1.2.4 h1:gN25/otpP5vAsO2djbMhF/LQX6R7+O1TB4yv8NzpJ3o= |
|||
github.com/kennygrant/sanitize v1.2.4/go.mod h1:LGsjYYtgxbetdg5owWB2mpgUL6e2nfw2eObZ0u0qvak= |
|||
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= |
|||
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= |
|||
github.com/saintfish/chardet v0.0.0-20120816061221-3af4cd4741ca h1:NugYot0LIVPxTvN8n+Kvkn6TrbMyxQiuvKdEwFdR9vI= |
|||
github.com/saintfish/chardet v0.0.0-20120816061221-3af4cd4741ca/go.mod h1:uugorj2VCxiV1x+LzaIdVa9b4S4qGAcH6cbhh4qVxOU= |
|||
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= |
|||
github.com/stretchr/testify v1.3.0 h1:TivCn/peBQ7UY8ooIcPgZFpTNSz0Q2U6UrFlUfqbe0Q= |
|||
github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= |
|||
github.com/temoto/robotstxt v1.1.2 h1:W2pOjSJ6SWvldyEuiFXNxz3xZ8aiWX5LbfDiOFd7Fxg= |
|||
github.com/temoto/robotstxt v1.1.2/go.mod h1:+1AmkuG3IYkh1kv0d2qEB9Le88ehNO0zwOr3ujewlOo= |
|||
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= |
|||
golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= |
|||
golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= |
|||
golang.org/x/net v0.0.0-20190603091049-60506f45cf65/go.mod h1:HSz+uSET+XFnRR8LxR5pz3Of3rY3CfYBVs4xY44aLks= |
|||
golang.org/x/net v0.0.0-20200421231249-e086a090c8fd/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A= |
|||
golang.org/x/net v0.0.0-20200813134508-3edf25e44fcc/go.mod h1:/O7V0waA8r7cgGh81Ro3o1hOxt32SMVPicZroKQ2sZA= |
|||
golang.org/x/net v0.0.0-20210916014120-12bc252f5db8/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y= |
|||
golang.org/x/net v0.0.0-20220225172249-27dd8689420f h1:oA4XRj0qtSt8Yo1Zms0CUlsT3KG69V2UGQWPBxujDmc= |
|||
golang.org/x/net v0.0.0-20220225172249-27dd8689420f/go.mod h1:CfG3xpIq0wQ8r1q4Su4UZFWDARRcnwPjda9FqA0JpMk= |
|||
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= |
|||
golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= |
|||
golang.org/x/sys v0.0.0-20200323222414-85ca7c5b95cd/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= |
|||
golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= |
|||
golang.org/x/sys v0.0.0-20210423082822-04245dca01da/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= |
|||
golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= |
|||
golang.org/x/sys v0.0.0-20211216021012-1d35b9e2eb4e/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= |
|||
golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= |
|||
golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= |
|||
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= |
|||
golang.org/x/text v0.3.2/go.mod h1:bEr9sfX3Q8Zfm5fL9x+3itogRgK3+ptLWKqgva+5dAk= |
|||
golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= |
|||
golang.org/x/text v0.3.7 h1:olpwvP2KacW1ZWvsR7uQhoyTYvKAupfQrRGBFM352Gk= |
|||
golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= |
|||
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= |
|||
google.golang.org/appengine v1.6.7 h1:FZR1q0exgwxzPzp/aF+VccGrSfxfPpkBqjIIEq3ru6c= |
|||
google.golang.org/appengine v1.6.7/go.mod h1:8WjMMxjGQR8xUklV/ARdw2HLXBOI7O7uCIDZVag1xfc= |
|||
Loading…
Reference in new issue