@@ -2,8 +2,10 @@ package main
 
 import (
 	"bytes"
+	"errors"
 	"io/ioutil"
 	"net/url"
+	"path"
 	"strings"
 	"sync"
 	"time"
@@ -14,10 +16,11 @@ import (
 
 type fileChecker struct {
 	urlChecker urlChecker
+	documentRoot string
 }
 
-func newFileChecker(timeout time.Duration, s semaphore) fileChecker {
-	return fileChecker{newURLChecker(timeout, s)}
+func newFileChecker(timeout time.Duration, r string, s semaphore) fileChecker {
+	return fileChecker{newURLChecker(timeout, s), r}
 }
 
 func (c fileChecker) Check(f string) ([]urlResult, error) {
@@ -27,7 +30,12 @@ func (c fileChecker) Check(f string) ([]urlResult, error) {
 		return nil, err
 	}
 
-	us := extractURLs(n)
+	us, err := c.extractURLs(n)
+
+	if err != nil {
+		return nil, err
+	}
+
 	rc := make(chan urlResult, len(us))
 	rs := make([]urlResult, 0, len(us))
 
@@ -81,11 +89,23 @@ func parseFile(f string) (*html.Node, error) {
 	return n, nil
 }
 
-func extractURLs(n *html.Node) []string {
+func (c fileChecker) extractURLs(n *html.Node) ([]string, error) {
 	us := make(map[string]bool)
 	ns := make([]*html.Node, 0, 1024)
 	ns = append(ns, n)
 
+	addURL := func(u string) error {
+		u, err := c.resolveURL(u)
+
+		if err != nil {
+			return err
+		}
+
+		us[u] = true
+
+		return nil
+	}
+
 	for len(ns) > 0 {
 		i := len(ns) - 1
 		n := ns[i]
@@ -96,14 +116,20 @@ func extractURLs(n *html.Node) []string {
 			case "a":
 				for _, a := range n.Attr {
 					if a.Key == "href" && isURL(a.Val) {
-						us[a.Val] = true
+						if err := addURL(a.Val); err != nil {
+							return nil, err
+						}
+
 						break
 					}
 				}
 			case "img":
 				for _, a := range n.Attr {
 					if a.Key == "src" && isURL(a.Val) {
-						us[a.Val] = true
+						if err := addURL(a.Val); err != nil {
+							return nil, err
+						}
+
 						break
 					}
 				}
@@ -115,7 +141,19 @@ func extractURLs(n *html.Node) []string {
 		}
 	}
 
-	return stringSetToSlice(us)
+	return stringSetToSlice(us), nil
+}
+
+func (c fileChecker) resolveURL(u string) (string, error) {
+	abs := strings.HasPrefix(u, "/")
+
+	if abs && c.documentRoot != "" {
+		return path.Join(c.documentRoot, u), nil
+	} else if abs {
+		return "", errors.New("document root directory is not specified")
+	}
+
+	return u, nil
 }
 
 func isURL(s string) bool {
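For reference, a minimal standalone sketch of the behaviour the new resolveURL method introduces: a root-relative link ("/...") is joined onto the configured document root with path.Join, and an error is returned when no root is set. It uses only the standard library; the helper name and sample paths are illustrative and not taken from the repository.

// Sketch only: mirrors the diff's resolve-against-document-root logic.
package main

import (
	"errors"
	"fmt"
	"path"
	"strings"
)

// resolveAgainstRoot is a hypothetical helper. A URL starting with "/" is
// mapped onto the local document root; without a root it is an error.
func resolveAgainstRoot(u, documentRoot string) (string, error) {
	if strings.HasPrefix(u, "/") {
		if documentRoot == "" {
			return "", errors.New("document root directory is not specified")
		}
		return path.Join(documentRoot, u), nil
	}
	return u, nil
}

func main() {
	r, _ := resolveAgainstRoot("/docs/index.html", "public")
	fmt.Println(r) // public/docs/index.html

	_, err := resolveAgainstRoot("/docs/index.html", "")
	fmt.Println(err) // document root directory is not specified
}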