From 522a2c18e77f102daba808eacb9fec19bf860a83 Mon Sep 17 00:00:00 2001
From: Yota Toyama
Date: Wed, 22 Nov 2017 22:53:02 +0900
Subject: [PATCH] Resolve URLs while checking them

---
 file_checker.go      | 45 +++++------------------------
 file_checker_test.go | 56 --------------------------------------------
 url_checker.go       | 29 +++++++++++++++++++----
 url_checker_test.go  | 43 +++++++++++++++++++++++++++++++---
 4 files changed, 71 insertions(+), 102 deletions(-)

diff --git a/file_checker.go b/file_checker.go
index a338ad5..cb605a8 100644
--- a/file_checker.go
+++ b/file_checker.go
@@ -2,10 +2,8 @@ package main
 
 import (
 	"bytes"
-	"errors"
 	"io/ioutil"
 	"net/url"
-	"path"
 	"strings"
 	"sync"
 	"time"
@@ -15,13 +13,12 @@ import (
 )
 
 type fileChecker struct {
-	urlChecker   urlChecker
-	documentRoot string
-	semaphore    semaphore
+	urlChecker urlChecker
+	semaphore  semaphore
 }
 
-func newFileChecker(timeout time.Duration, r string, s semaphore) fileChecker {
-	return fileChecker{newURLChecker(timeout, s), r, s}
+func newFileChecker(timeout time.Duration, d string, s semaphore) fileChecker {
+	return fileChecker{newURLChecker(timeout, d, s), s}
 }
 
 func (c fileChecker) Check(f string) ([]urlResult, error) {
@@ -96,18 +93,6 @@ func (c fileChecker) extractURLs(n *html.Node) ([]string, error) {
 	us := make(map[string]bool)
 	ns := []*html.Node{n}
 
-	addURL := func(u string) error {
-		u, err := c.resolveURL(u)
-
-		if err != nil {
-			return err
-		}
-
-		us[u] = true
-
-		return nil
-	}
-
 	for len(ns) > 0 {
 		i := len(ns) - 1
 		n := ns[i]
@@ -118,20 +103,14 @@
 		case "a":
 			for _, a := range n.Attr {
 				if a.Key == "href" && isURL(a.Val) {
-					if err := addURL(a.Val); err != nil {
-						return nil, err
-					}
-
+					us[a.Val] = true
 					break
 				}
 			}
 		case "img":
 			for _, a := range n.Attr {
 				if a.Key == "src" && isURL(a.Val) {
-					if err := addURL(a.Val); err != nil {
-						return nil, err
-					}
-
+					us[a.Val] = true
 					break
 				}
 			}
@@ -146,18 +125,6 @@
 	return stringSetToSlice(us), nil
 }
 
-func (c fileChecker) resolveURL(u string) (string, error) {
-	abs := strings.HasPrefix(u, "/")
-
-	if abs && c.documentRoot != "" {
-		return path.Join(c.documentRoot, u), nil
-	} else if abs {
-		return "", errors.New("document root directory is not specified")
-	}
-
-	return u, nil
-}
-
 func isURL(s string) bool {
 	if strings.HasPrefix(s, "#") {
 		return false
diff --git a/file_checker_test.go b/file_checker_test.go
index 31888aa..f2c54e7 100644
--- a/file_checker_test.go
+++ b/file_checker_test.go
@@ -144,62 +144,6 @@ func TestFileCheckerExtractURLs(t *testing.T) {
 	}
 }
 
-func TestFileCheckerExtractURLsWithInvalidHTML(t *testing.T) {
-	c := newFileChecker(0, "", newSemaphore(42))
-
-	for _, s := range []string{
-		`link`,
-		``,
-	} {
-		n, err := html.Parse(strings.NewReader(s))
-
-		assert.Equal(t, nil, err)
-
-		us, err := c.extractURLs(n)
-
-		assert.Equal(t, ([]string)(nil), us)
-		assert.NotEqual(t, nil, err)
-	}
-}
-
-func TestFileCheckerResolveURL(t *testing.T) {
-	f := newFileChecker(0, "", newSemaphore(1024))
-
-	for _, c := range []struct{ source, target string }{
-		{"foo", "foo"},
-		{"https://google.com", "https://google.com"},
-	} {
-		u, err := f.resolveURL(c.source)
-
-		assert.Equal(t, nil, err)
-		assert.Equal(t, c.target, u)
-	}
-}
-
-func TestFileCheckerResolveURLWithAbsolutePath(t *testing.T) {
-	f := newFileChecker(0, "", newSemaphore(1024))
-
-	u, err := f.resolveURL("/foo")
-
-	assert.NotEqual(t, nil, err)
-	assert.Equal(t, "", u)
-}
-
-func TestFileCheckerResolveURLWithDocumentRoot(t *testing.T) {
-	f := newFileChecker(0, "foo", newSemaphore(1024))
-
-	for _, c := range []struct{ source, target string }{
-		{"foo", "foo"},
-		{"https://google.com", "https://google.com"},
-		{"/foo", "foo/foo"},
-	} {
-		u, err := f.resolveURL(c.source)
-
-		assert.Equal(t, nil, err)
-		assert.Equal(t, c.target, u)
-	}
-}
-
 func TestURLParse(t *testing.T) {
 	u, err := url.Parse("file-path")
 
diff --git a/url_checker.go b/url_checker.go
index 8d897c6..7fab12d 100644
--- a/url_checker.go
+++ b/url_checker.go
@@ -1,9 +1,11 @@
 package main
 
 import (
+	"errors"
 	"net/url"
 	"os"
 	"path"
+	"strings"
 	"sync"
 	"time"
 
@@ -11,15 +13,22 @@ import (
 )
 
 type urlChecker struct {
-	timeout   time.Duration
-	semaphore semaphore
+	timeout      time.Duration
+	documentRoot string
+	semaphore    semaphore
 }
 
-func newURLChecker(t time.Duration, s semaphore) urlChecker {
-	return urlChecker{t, s}
+func newURLChecker(t time.Duration, d string, s semaphore) urlChecker {
+	return urlChecker{t, d, s}
 }
 
 func (c urlChecker) Check(u string, f string) error {
+	u, err := c.resolveURL(u)
+
+	if err != nil {
+		return err
+	}
+
 	uu, err := url.Parse(u)
 
 	if err != nil {
@@ -58,6 +67,18 @@ func (c urlChecker) CheckMany(us []string, f string, rc chan<- urlResult) {
 	close(rc)
 }
 
+func (c urlChecker) resolveURL(u string) (string, error) {
+	abs := strings.HasPrefix(u, "/")
+
+	if abs && c.documentRoot != "" {
+		return path.Join(c.documentRoot, u), nil
+	} else if abs {
+		return "", errors.New("document root directory is not specified")
+	}
+
+	return u, nil
+}
+
 func checkRelativePath(p string, f string) error {
 	_, err := os.Stat(path.Join(path.Dir(f), p))
 	return err
diff --git a/url_checker_test.go b/url_checker_test.go
index 3949262..8afd0c2 100644
--- a/url_checker_test.go
+++ b/url_checker_test.go
@@ -8,7 +8,7 @@ import (
 )
 
 func TestURLCheckerCheck(t *testing.T) {
-	c := newURLChecker(0, newSemaphore(1024))
+	c := newURLChecker(0, "", newSemaphore(1024))
 
 	for _, u := range []string{"https://google.com", "README.md"} {
 		assert.Equal(t, nil, c.Check(u, "README.md"))
@@ -20,7 +20,7 @@ func TestURLCheckerCheck(t *testing.T) {
 }
 
 func TestURLCheckerCheckWithTimeout(t *testing.T) {
-	c := newURLChecker(30*time.Second, newSemaphore(1024))
+	c := newURLChecker(30*time.Second, "", newSemaphore(1024))
 
 	for _, u := range []string{"https://google.com", "README.md"} {
 		assert.Equal(t, nil, c.Check(u, "README.md"))
@@ -32,7 +32,7 @@ func TestURLCheckerCheckWithTimeout(t *testing.T) {
 }
 
 func TestURLCheckerCheckMany(t *testing.T) {
-	c := newURLChecker(0, newSemaphore(1024))
+	c := newURLChecker(0, "", newSemaphore(1024))
 
 	for _, us := range [][]string{{}, {"https://google.com", "README.md"}} {
 		rc := make(chan urlResult, 1024)
 		go c.CheckMany(us, "CONTRIBUTING.md", rc)
 
 		for r := range rc {
 			assert.Equal(t, true, r.err == nil)
 		}
 	}
 }
+func TestURLCheckerResolveURL(t *testing.T) {
+	f := newURLChecker(0, "", newSemaphore(1024))
+
+	for _, c := range []struct{ source, target string }{
+		{"foo", "foo"},
{"https://google.com", "https://google.com"}, + {"/foo", "foo/foo"}, + } { + u, err := f.resolveURL(c.source) + + assert.Equal(t, nil, err) + assert.Equal(t, c.target, u) + } +}