You cannot select more than 25 topics.
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
438 lines
10 KiB
438 lines
10 KiB
package github
|
|
|
|
import (
|
|
"bufio"
|
|
"compress/gzip"
|
|
"context"
|
|
"encoding/json"
|
|
"fmt"
|
|
"log"
|
|
"net/http"
|
|
"os"
|
|
"time"
|
|
|
|
"github.com/jackc/pgx/v4"
|
|
)
|
|
|
|
type GithubEvent struct {
|
|
Id string `json:"id"`
|
|
Type string `json:"type"`
|
|
Repository GithubRepository `json:"repo"`
|
|
Timestamp time.Time `json:"created_at"`
|
|
Payload GithubEventPayload `json:"payload"`
|
|
}
|
|
|
|
// GithubRepository identifies a repository as it appears in an event's
// "repo" object.
type GithubRepository struct {
	// Id is the numeric repository identifier ("id").
	Id int64 `json:"id"`
	// Name is the repository name as given in the "name" key.
	Name string `json:"name"`
}
|
|
|
|
type GithubEventPayload struct {
|
|
Action string `json:"action"`
|
|
Release ReleaseEventPayload `json:"release"`
|
|
Forkee ForkEventPayload `json:"forkee"`
|
|
Issue IssueEventPayload `json:"issue"`
|
|
PullRequest PullRequestEventPayload `json:"pull_request"`
|
|
}
|
|
|
|
// IssueEventPayload carries the fields read from a payload's "issue" object.
type IssueEventPayload struct {
	// Id is the issue's "id" value.
	Id int64 `json:"id"`
	// Number is the issue's "number" value.
	Number int `json:"number"`
}
|
|
|
|
// PullRequestEventPayload carries the fields read from a payload's
// "pull_request" object.
type PullRequestEventPayload struct {
	// Id is the pull request's "id" value.
	Id int64 `json:"id"`
	// Number is the pull request's "number" value.
	Number int `json:"number"`
}
|
|
|
|
// ForkEventPayload carries the fields read from a payload's "forkee" object,
// i.e. the repository created by a fork.
type ForkEventPayload struct {
	// Id is the forkee's "id" value.
	Id int64 `json:"id"`
	// Name is the forkee's "full_name" value.
	Name string `json:"full_name"`
}
|
|
|
|
// ReleaseEventPayload carries the fields read from a payload's "release"
// object.
type ReleaseEventPayload struct {
	// Id is the release's "id" value.
	Id int64 `json:"id"`
	// TagName is the release's "tag_name" value.
	TagName string `json:"tag_name"`
}
|
|
|
|
func (evt *GithubEvent) String() string {
|
|
switch {
|
|
case evt.Type == "CreateEvent":
|
|
return fmt.Sprintf("%s: Repository(%s)",
|
|
evt.Timestamp.Format("2006-01-02"),
|
|
evt.Repository.Name)
|
|
case evt.Type == "WatchEvent":
|
|
return fmt.Sprintf("%s: Star(%s)",
|
|
evt.Timestamp.Format("2006-01-02"),
|
|
evt.Repository.Name)
|
|
case evt.Type == "PullRequestEvent":
|
|
return fmt.Sprintf("%s: PR(%s, %d)",
|
|
evt.Timestamp.Format("2006-01-02"),
|
|
evt.Repository.Name,
|
|
evt.Payload.PullRequest.Number)
|
|
case evt.Type == "ForkEvent":
|
|
return fmt.Sprintf("%s: Fork(%s)",
|
|
evt.Timestamp.Format("2006-01-02"),
|
|
evt.Repository.Name)
|
|
case evt.Type == "IssuesEvent":
|
|
return fmt.Sprintf("%s: Issue(%s, %d)",
|
|
evt.Timestamp.Format("2006-01-02"),
|
|
evt.Repository.Name,
|
|
evt.Payload.Issue.Number)
|
|
case evt.Type == "ReleaseEvent":
|
|
return fmt.Sprintf("%s: Release(%s, %s)",
|
|
evt.Timestamp.Format("2006-01-02"),
|
|
evt.Repository.Name,
|
|
evt.Payload.Release.TagName)
|
|
}
|
|
|
|
return ""
|
|
}
|
|
|
|
func (evt *GithubEvent) IsRelevant() bool {
|
|
switch {
|
|
case evt.Type == "CreateEvent":
|
|
return true
|
|
case evt.Type == "WatchEvent":
|
|
return true
|
|
case evt.Type == "PullRequestEvent":
|
|
return evt.Payload.Action == "opened"
|
|
case evt.Type == "ForkEvent":
|
|
return true
|
|
case evt.Type == "IssuesEvent":
|
|
return evt.Payload.Action == "opened"
|
|
case evt.Type == "ReleaseEvent":
|
|
return evt.Payload.Action == "released" || evt.Payload.Action == "published"
|
|
}
|
|
|
|
return false
|
|
}
|
|
|
|
// SQL statements used by the importer. All placeholders are positional.
const (
	// CreateEventQuery inserts a repository (id, name) and leaves an existing
	// row with the same id untouched.
	CreateEventQuery string = `
		INSERT INTO github_repository (id, name)
		VALUES ($1, $2)
		ON CONFLICT (id) DO NOTHING`

	// WatchEventQuery inserts a repository with one star, or increments the
	// star counter of an existing row.
	WatchEventQuery string = `
		INSERT INTO github_repository (id, name, stars)
		VALUES ($1, $2, 1)
		ON CONFLICT (id)
		DO UPDATE SET stars = github_repository.stars + 1`

	// PullRequestEventQuery inserts a repository with one pull request, or
	// increments the pull-request counter of an existing row.
	PullRequestEventQuery string = `
		INSERT INTO github_repository (id, name, pull_requests)
		VALUES ($1, $2, 1)
		ON CONFLICT (id)
		DO UPDATE SET pull_requests = github_repository.pull_requests + 1`

	// IssuesEventQuery inserts a repository with one issue, or increments the
	// issue counter of an existing row.
	IssuesEventQuery string = `
		INSERT INTO github_repository (id, name, issues)
		VALUES ($1, $2, 1)
		ON CONFLICT (id)
		DO UPDATE SET issues = github_repository.issues + 1`

	// ForkEventQueryParent inserts the forked (parent) repository with one
	// fork, or increments the fork counter of an existing row.
	ForkEventQueryParent string = `
		INSERT INTO github_repository (id, name, forks)
		VALUES ($1, $2, 1)
		ON CONFLICT (id)
		DO UPDATE SET forks = github_repository.forks + 1`

	// ForkEventQueryChild inserts the fork itself (id, name, forked_from) and
	// leaves an existing row with the same id untouched.
	ForkEventQueryChild string = `
		INSERT INTO github_repository (id, name, forked_from)
		VALUES ($1, $2, $3)
		ON CONFLICT (id)
		DO NOTHING`

	// ReleaseEventQueryRepo makes sure the releasing repository has a row
	// without modifying an existing one.
	ReleaseEventQueryRepo string = `
		INSERT INTO github_repository (id, name)
		VALUES ($1, $2)
		ON CONFLICT (id)
		DO NOTHING`

	// ReleaseEventQueryRelease upserts a release (id, repo_id, timestamp,
	// tag_name); on conflict the timestamp and tag name are overwritten.
	ReleaseEventQueryRelease string = `
		INSERT INTO github_release (id, repo_id, timestamp, tag_name)
		VALUES ($1, $2, $3, $4)
		ON CONFLICT (id)
		DO UPDATE SET timestamp = $3, tag_name = $4`

	// PersistStateQuery upserts the import state of one event package.
	PersistStateQuery string = `
		INSERT INTO github_event_package (id, last_event, done, last_error)
		VALUES ($1, $2, $3, $4)
		ON CONFLICT (id)
		DO UPDATE SET last_event = $2, done = $3, last_error = $4`

	// GetStateQuery loads the import state of the package with the given id.
	GetStateQuery string = `
		SELECT id, last_event, done, last_error
		FROM github_event_package
		WHERE id = $1`

	// GetNextPackageQuery loads the package with the smallest id that is not
	// yet marked done.
	GetNextPackageQuery string = `
		SELECT id, last_event, done, last_error
		FROM github_event_package
		WHERE NOT done
		ORDER BY id ASC
		LIMIT 1`

	// GetLastPackageQuery loads the package with the largest id.
	GetLastPackageQuery string = `
		SELECT id, last_event, done, last_error
		FROM github_event_package
		ORDER BY id DESC
		LIMIT 1`
)

// Importer tuning parameters.
const (
	// MaxTokenSize is the scanner buffer size (10 MiB) and therefore the
	// longest single archive line the importer can read.
	MaxTokenSize int = 10 * 1024 * 1024

	// BackoffDelay is a 60-second delay. It is not referenced in this file;
	// presumably callers wait this long before retrying — verify at the
	// call site.
	BackoffDelay time.Duration = 60 * time.Second
)
|
|
|
|
// GithubPackage tracks the import state of one hourly event archive.
type GithubPackage struct {
	// ID is the hour the archive covers; it doubles as the database key.
	ID time.Time
	// LastEvent is the id of the last event processed, used to resume an
	// unfinished import. It is empty when there is nothing to resume.
	LastEvent string
	// Done is set once the whole archive has been imported.
	Done bool
	// LastError holds the message of the most recent import failure, if any.
	LastError string
}
|
|
|
|
func NewGithubPackage(db *pgx.Conn, id time.Time) (GithubPackage, error) {
|
|
rows, err := db.Query(context.Background(), GetStateQuery, id)
|
|
if err != nil {
|
|
return GithubPackage{}, err
|
|
}
|
|
defer rows.Close()
|
|
|
|
if rows.Next() {
|
|
var pkg GithubPackage
|
|
err = rows.Scan(&pkg.ID, &pkg.LastEvent, &pkg.Done, &pkg.LastError)
|
|
if err != nil {
|
|
return GithubPackage{}, err
|
|
}
|
|
return pkg, nil
|
|
}
|
|
|
|
return GithubPackage{ID: id}, nil
|
|
}
|
|
|
|
func NextGithubPackage(db *pgx.Conn) (GithubPackage, error) {
|
|
rows, err := db.Query(context.Background(), GetNextPackageQuery)
|
|
if err != nil {
|
|
return GithubPackage{}, err
|
|
}
|
|
defer rows.Close()
|
|
|
|
if rows.Next() {
|
|
var pkg GithubPackage
|
|
err = rows.Scan(&pkg.ID, &pkg.LastEvent, &pkg.Done, &pkg.LastError)
|
|
if err != nil {
|
|
return GithubPackage{}, err
|
|
}
|
|
return pkg, nil
|
|
}
|
|
rows.Close()
|
|
|
|
rows, err = db.Query(context.Background(), GetLastPackageQuery)
|
|
if err != nil {
|
|
return GithubPackage{}, err
|
|
}
|
|
defer rows.Close()
|
|
|
|
if rows.Next() {
|
|
var pkg GithubPackage
|
|
err = rows.Scan(&pkg.ID, &pkg.LastEvent, &pkg.Done, &pkg.LastError)
|
|
if err != nil {
|
|
return GithubPackage{}, err
|
|
}
|
|
return pkg.Next(), nil
|
|
}
|
|
|
|
return GithubPackage{ID: time.Date(2015, time.January, 1, 0, 0, 0, 0, time.UTC)}, nil
|
|
}
|
|
|
|
func (pkg *GithubPackage) Next() GithubPackage {
|
|
var newPkg GithubPackage
|
|
newPkg.ID = pkg.ID.Add(time.Hour)
|
|
return newPkg
|
|
}
|
|
|
|
func (pkg *GithubPackage) Persist(db *pgx.Conn) error {
|
|
rows, err := db.Query(context.Background(), PersistStateQuery,
|
|
pkg.ID,
|
|
pkg.LastEvent,
|
|
pkg.Done,
|
|
pkg.LastError)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
rows.Close()
|
|
return nil
|
|
}
|
|
|
|
func (pkg *GithubPackage) Import(db *pgx.Conn, stop chan os.Signal) error {
|
|
err := pkg.BareImport(db, stop)
|
|
if err != nil {
|
|
pkg.LastError = err.Error()
|
|
log.Printf("Import of package %s failed with error %s", pkg.String(), err)
|
|
} else {
|
|
log.Printf("Import of package %s completed successfully", pkg.String())
|
|
pkg.LastEvent = ""
|
|
pkg.LastError = ""
|
|
}
|
|
|
|
e := pkg.Persist(db)
|
|
if e != nil {
|
|
return e
|
|
}
|
|
|
|
return err
|
|
}
|
|
|
|
func (pkg *GithubPackage) String() string {
|
|
return fmt.Sprintf("%s-%d", pkg.ID.UTC().Format("2006-01-02"), pkg.ID.UTC().Hour())
|
|
}
|
|
|
|
// InterruptedError is returned when an import is aborted because a signal
// arrived on the stop channel.
type InterruptedError struct {
	// Signal is the signal that caused the interruption.
	Signal os.Signal
}
|
|
|
|
func (err InterruptedError) Error() string {
|
|
return fmt.Sprintf("Interrupted by %s", err.Signal.String())
|
|
}
|
|
|
|
func (pkg *GithubPackage) BareImport(db *pgx.Conn, stop chan os.Signal) error {
|
|
url := fmt.Sprintf("https://data.gharchive.org/%s.json.gz", pkg.String())
|
|
log.Printf("Fetching %s...", url)
|
|
resp, err := http.Get(url)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
if resp.StatusCode != 200 {
|
|
return fmt.Errorf("Wrong status code: %d", resp.StatusCode)
|
|
}
|
|
|
|
gz, err := gzip.NewReader(resp.Body)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
defer gz.Close()
|
|
|
|
var buffer []byte = make([]byte, MaxTokenSize)
|
|
scanner := bufio.NewScanner(gz)
|
|
scanner.Buffer(buffer, MaxTokenSize)
|
|
var skip bool = false
|
|
if pkg.LastEvent != "" {
|
|
log.Printf("Resuming import at %s...", pkg.LastEvent)
|
|
skip = true
|
|
} else {
|
|
log.Printf("Started import of package %s...", pkg.String())
|
|
}
|
|
|
|
for scanner.Scan() {
|
|
var evt GithubEvent
|
|
err := json.Unmarshal(scanner.Bytes(), &evt)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
if skip {
|
|
if pkg.LastEvent == evt.Id {
|
|
skip = false
|
|
log.Printf("Started import of package %s...", pkg.String())
|
|
}
|
|
continue
|
|
}
|
|
|
|
if evt.IsRelevant() {
|
|
tx, err := db.Begin(context.Background())
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
defer tx.Rollback(context.Background())
|
|
|
|
switch {
|
|
case evt.Type == "CreateEvent":
|
|
rows, err := tx.Query(context.Background(), CreateEventQuery,
|
|
evt.Repository.Id,
|
|
evt.Repository.Name)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
rows.Close()
|
|
case evt.Type == "WatchEvent":
|
|
rows, err := tx.Query(context.Background(), WatchEventQuery,
|
|
evt.Repository.Id,
|
|
evt.Repository.Name)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
rows.Close()
|
|
case evt.Type == "PullRequestEvent":
|
|
rows, err := tx.Query(context.Background(), PullRequestEventQuery,
|
|
evt.Repository.Id,
|
|
evt.Repository.Name)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
rows.Close()
|
|
case evt.Type == "ForkEvent":
|
|
rows, err := tx.Query(context.Background(), ForkEventQueryParent,
|
|
evt.Repository.Id,
|
|
evt.Repository.Name)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
rows.Close()
|
|
rows, err = tx.Query(context.Background(), ForkEventQueryChild,
|
|
evt.Payload.Forkee.Id,
|
|
evt.Payload.Forkee.Name,
|
|
evt.Repository.Id)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
rows.Close()
|
|
case evt.Type == "IssuesEvent":
|
|
rows, err := tx.Query(context.Background(), IssuesEventQuery,
|
|
evt.Repository.Id,
|
|
evt.Repository.Name)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
rows.Close()
|
|
case evt.Type == "ReleaseEvent":
|
|
rows, err := tx.Query(context.Background(), ReleaseEventQueryRepo,
|
|
evt.Repository.Id,
|
|
evt.Repository.Name)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
rows.Close()
|
|
|
|
rows, err = tx.Query(context.Background(), ReleaseEventQueryRelease,
|
|
evt.Payload.Release.Id,
|
|
evt.Repository.Id,
|
|
evt.Timestamp,
|
|
evt.Payload.Release.TagName)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
rows.Close()
|
|
}
|
|
err = tx.Commit(context.Background())
|
|
if err != nil {
|
|
return err
|
|
}
|
|
}
|
|
pkg.LastEvent = evt.Id
|
|
|
|
select {
|
|
case sig := <-stop:
|
|
return &InterruptedError{Signal: sig}
|
|
default:
|
|
}
|
|
}
|
|
if err := scanner.Err(); err != nil {
|
|
return err
|
|
}
|
|
|
|
pkg.Done = true
|
|
return nil
|
|
}
|
|
|