From 1f08c7000998de665667b134562a1c5a19298b75 Mon Sep 17 00:00:00 2001
From: Dan Carley
Date: Tue, 31 Jul 2018 23:33:14 +0100
Subject: [PATCH] Implement --exclude argument

So that you can pass a regex of links (paths or URLs) to exclude from
checking. My use case for this is legitimate links in documentation that
may not function at the time of checking, for example:

- development servers
- private repos

I've implemented this all the way down in `urlChecker`, rather than
`fileChecker`, so that we can compare the fully resolved path/URL.

I would have found some of the codebase easier to read if longer variable
names had been used and struct fields specified, but I've matched the
house style for now.
---
 README.md            |  3 ++-
 arguments.go         | 14 +++++++++++++-
 arguments_test.go    | 31 ++++++++++++++++++++-----------
 file_checker.go      |  5 +++--
 file_checker_test.go |  8 ++++----
 main.go              |  2 +-
 url_checker.go       | 11 ++++++++---
 url_checker_test.go  | 25 +++++++++++++++++++------
 8 files changed, 70 insertions(+), 29 deletions(-)

diff --git a/README.md b/README.md
index 74c1e03..6f0dfc7 100644
--- a/README.md
+++ b/README.md
@@ -23,13 +23,14 @@ go get -u github.com/raviqqe/liche
 Link checker for Markdown and HTML
 
 Usage:
-	liche [-c <num-requests>] [-d <directory>] [-r] [-t <timeout>] [-v] <filenames>...
+	liche [-c <num-requests>] [-d <directory>] [-r] [-t <timeout>] [-x <regex>] [-v] <filenames>...
 
 Options:
 	-c, --concurrency <num-requests>  Set max number of concurrent HTTP requests. [default: 512]
 	-d, --document-root <directory>  Set document root directory for absolute paths.
 	-r, --recursive  Search Markdown and HTML files recursively
 	-t, --timeout <timeout>  Set timeout for HTTP requests in seconds. Disabled by default.
+	-x, --exclude <regex>  Regex of links to exclude from checking.
 	-v, --verbose  Be verbose.
 ```
diff --git a/arguments.go b/arguments.go
index dd945de..735278f 100644
--- a/arguments.go
+++ b/arguments.go
@@ -2,6 +2,7 @@ package main
 
 import (
 	"fmt"
+	"regexp"
 	"strconv"
 	"time"
 
@@ -13,13 +14,14 @@ const defaultConcurrency = maxOpenFiles / 2
 const usage = `Link checker for Markdown and HTML
 
 Usage:
-	liche [-c <num-requests>] [-d <directory>] [-r] [-t <timeout>] [-v] <filenames>...
+	liche [-c <num-requests>] [-d <directory>] [-r] [-t <timeout>] [-x <regex>] [-v] <filenames>...
 
 Options:
 	-c, --concurrency <num-requests>  Set max number of concurrent HTTP requests. [default: %v]
 	-d, --document-root <directory>  Set document root directory for absolute paths.
 	-r, --recursive  Search Markdown and HTML files recursively
 	-t, --timeout <timeout>  Set timeout for HTTP requests in seconds. Disabled by default.
+	-x, --exclude <regex>  Regex of links to exclude from checking.
 	-v, --verbose  Be verbose.`
 
 type arguments struct {
@@ -28,6 +30,7 @@ type arguments struct {
 	concurrency  int
 	timeout      time.Duration
 	recursive    bool
+	exclude      *regexp.Regexp
 	verbose      bool
 }
 
@@ -58,12 +61,21 @@ func getArguments(argv []string) (arguments, error) {
 		}
 	}
 
+	var exclude *regexp.Regexp
+	if args["--exclude"] != nil {
+		exclude, err = regexp.Compile(args["--exclude"].(string))
+		if err != nil {
+			return arguments{}, err
+		}
+	}
+
 	return arguments{
 		args["<filenames>"].([]string),
 		args["--document-root"].(string),
 		int(c),
 		time.Duration(t) * time.Second,
 		args["--recursive"].(bool),
+		exclude,
 		args["--verbose"].(bool),
 	}, nil
 }
diff --git a/arguments_test.go b/arguments_test.go
index eae6f38..76c4098 100644
--- a/arguments_test.go
+++ b/arguments_test.go
@@ -1,6 +1,7 @@
 package main
 
 import (
+	"regexp"
 	"testing"
 	"time"
 
@@ -14,47 +15,55 @@ func TestGetArguments(t *testing.T) {
 	}{
 		{
 			argv: []string{"file"},
-			args: arguments{[]string{"file"}, "", defaultConcurrency, 0, false, false},
+			args: arguments{[]string{"file"}, "", defaultConcurrency, 0, false, nil, false},
 		},
 		{
 			argv: []string{"-c", "42", "file"},
-			args: arguments{[]string{"file"}, "", 42, 0, false, false},
+			args: arguments{[]string{"file"}, "", 42, 0, false, nil, false},
 		},
 		{
 			argv: []string{"--concurrency", "42", "file"},
-			args: arguments{[]string{"file"}, "", 42, 0, false, false},
+			args: arguments{[]string{"file"}, "", 42, 0, false, nil, false},
 		},
 		{
 			argv: []string{"-d", "directory", "file"},
-			args: arguments{[]string{"file"}, "directory", defaultConcurrency, 0, false, false},
+			args: arguments{[]string{"file"}, "directory", defaultConcurrency, 0, false, nil, false},
 		},
 		{
 			argv: []string{"--document-root", "directory", "file"},
-			args: arguments{[]string{"file"}, "directory", defaultConcurrency, 0, false, false},
+			args: arguments{[]string{"file"}, "directory", defaultConcurrency, 0, false, nil, false},
 		},
 		{
 			argv: []string{"-r", "file"},
-			args: arguments{[]string{"file"}, "", defaultConcurrency, 0, true, false},
+			args: arguments{[]string{"file"}, "", defaultConcurrency, 0, true, nil, false},
 		},
 		{
 			argv: []string{"--recursive", "file"},
-			args: arguments{[]string{"file"}, "", defaultConcurrency, 0, true, false},
+			args: arguments{[]string{"file"}, "", defaultConcurrency, 0, true, nil, false},
 		},
 		{
 			argv: []string{"-t", "42", "file"},
-			args: arguments{[]string{"file"}, "", defaultConcurrency, 42 * time.Second, false, false},
+			args: arguments{[]string{"file"}, "", defaultConcurrency, 42 * time.Second, false, nil, false},
 		},
 		{
 			argv: []string{"--timeout", "42", "file"},
-			args: arguments{[]string{"file"}, "", defaultConcurrency, 42 * time.Second, false, false},
+			args: arguments{[]string{"file"}, "", defaultConcurrency, 42 * time.Second, false, nil, false},
 		},
+		{
+			argv: []string{"-x", "^.*$", "file"},
+			args: arguments{[]string{"file"}, "", defaultConcurrency, 0, false, regexp.MustCompile(`^.*$`), false},
+		},
+		{
+			argv: []string{"--exclude", "^.*$", "file"},
+			args: arguments{[]string{"file"}, "", defaultConcurrency, 0, false, regexp.MustCompile(`^.*$`), false},
+		},
 		{
 			argv: []string{"-v", "file"},
-			args: arguments{[]string{"file"}, "", defaultConcurrency, 0, false, true},
+			args: arguments{[]string{"file"}, "", defaultConcurrency, 0, false, nil, true},
 		},
 		{
 			argv: []string{"--verbose", "file"},
-			args: arguments{[]string{"file"}, "", defaultConcurrency, 0, false, true},
+			args: arguments{[]string{"file"}, "", defaultConcurrency, 0, false, nil, true},
 		},
 	} {
 		args, err := getArguments(c.argv)
diff --git a/file_checker.go b/file_checker.go
index cb605a8..7c456be 100644
--- a/file_checker.go
+++ b/file_checker.go
@@ -4,6 +4,7 @@ import (
 	"bytes"
 	"io/ioutil"
 	"net/url"
+	"regexp"
 	"strings"
 	"sync"
 	"time"
@@ -17,8 +18,8 @@ type fileChecker struct {
 	semaphore  semaphore
 }
 
-func newFileChecker(timeout time.Duration, d string, s semaphore) fileChecker {
-	return fileChecker{newURLChecker(timeout, d, s), s}
+func newFileChecker(timeout time.Duration, d string, x *regexp.Regexp, s semaphore) fileChecker {
+	return fileChecker{newURLChecker(timeout, d, x, s), s}
 }
 
 func (c fileChecker) Check(f string) ([]urlResult, error) {
diff --git a/file_checker_test.go b/file_checker_test.go
index f2c54e7..06a70c5 100644
--- a/file_checker_test.go
+++ b/file_checker_test.go
@@ -10,7 +10,7 @@ import (
 )
 
 func TestFileCheckerCheck(t *testing.T) {
-	c := newFileChecker(0, "", newSemaphore(1024))
+	c := newFileChecker(0, "", nil, newSemaphore(1024))
 
 	for _, f := range []string{"README.md", "test/foo.md", "test/foo.html"} {
 		rs, err := c.Check(f)
@@ -48,7 +48,7 @@ func TestFileCheckerCheck(t *testing.T) {
 }
 
 func TestFileCheckerCheckMany(t *testing.T) {
-	c := newFileChecker(0, "", newSemaphore(maxOpenFiles))
+	c := newFileChecker(0, "", nil, newSemaphore(maxOpenFiles))
 
 	for _, fs := range [][]string{
 		{"README.md"},
@@ -77,7 +77,7 @@ func TestFileCheckerCheckMany(t *testing.T) {
 }
 
 func TestFileCheckerCheckManyWithInvalidFiles(t *testing.T) {
-	c := newFileChecker(0, "", newSemaphore(maxOpenFiles))
+	c := newFileChecker(0, "", nil, newSemaphore(maxOpenFiles))
 
 	for _, fs := range [][]string{
 		{"test/absolute_path.md"},
@@ -107,7 +107,7 @@ func TestFileCheckerCheckManyWithInvalidFiles(t *testing.T) {
 }
 
 func TestFileCheckerExtractURLs(t *testing.T) {
-	c := newFileChecker(0, "", newSemaphore(42))
+	c := newFileChecker(0, "", nil, newSemaphore(42))
 
 	for _, x := range []struct {
 		html string
diff --git a/main.go b/main.go
index d4154b4..a2f8d4b 100644
--- a/main.go
+++ b/main.go
@@ -30,7 +30,7 @@ func main() {
 	}()
 
 	rc := make(chan fileResult, maxOpenFiles)
-	c := newFileChecker(args.timeout, args.documentRoot, newSemaphore(args.concurrency))
+	c := newFileChecker(args.timeout, args.documentRoot, args.exclude, newSemaphore(args.concurrency))
 
 	go c.CheckMany(m.Filenames(), rc)
diff --git a/url_checker.go b/url_checker.go
index c570975..5e4713a 100644
--- a/url_checker.go
+++ b/url_checker.go
@@ -5,6 +5,7 @@ import (
 	"net/url"
 	"os"
 	"path"
+	"regexp"
 	"sync"
 	"time"
 
@@ -14,20 +15,24 @@ import (
 type urlChecker struct {
 	timeout      time.Duration
 	documentRoot string
+	exclude      *regexp.Regexp
 	semaphore    semaphore
 }
 
-func newURLChecker(t time.Duration, d string, s semaphore) urlChecker {
-	return urlChecker{t, d, s}
+func newURLChecker(t time.Duration, d string, x *regexp.Regexp, s semaphore) urlChecker {
+	return urlChecker{t, d, x, s}
 }
 
 func (c urlChecker) Check(u string, f string) error {
 	u, local, err := c.resolveURL(u, f)
-
 	if err != nil {
 		return err
 	}
 
+	if c.exclude != nil && c.exclude.MatchString(u) {
+		return nil
+	}
+
 	if local {
 		_, err := os.Stat(u)
 		return err
diff --git a/url_checker_test.go b/url_checker_test.go
index 03b2d89..435976b 100644
--- a/url_checker_test.go
+++ b/url_checker_test.go
@@ -1,6 +1,7 @@
 package main
 
 import (
+	"regexp"
 	"testing"
 	"time"
 
@@ -8,7 +9,7 @@ import (
 )
 
 func TestURLCheckerCheck(t *testing.T) {
-	c := newURLChecker(0, "", newSemaphore(1024))
+	c := newURLChecker(0, "", nil, newSemaphore(1024))
 
 	for _, u := range []string{"https://google.com", "README.md"} {
 		assert.Equal(t, nil, c.Check(u, "README.md"))
@@ -19,8 +20,20 @@ func TestURLCheckerCheck(t *testing.T) {
 	}
 }
 
+func TestURLCheckerCheckWithExclude(t *testing.T) {
+	c := newURLChecker(0, "", regexp.MustCompile(`^http:\/\/localhost:[13]$`), newSemaphore(1024))
+
+	for _, u := range []string{"http://localhost:1", "http://localhost:3", "README.md"} {
+		assert.Equal(t, nil, c.Check(u, "README.md"))
+	}
+
+	for _, u := range []string{"http://localhost:2", "READYOU.md"} {
+		assert.NotEqual(t, nil, c.Check(u, "README.md"))
+	}
+}
+
 func TestURLCheckerCheckWithTimeout(t *testing.T) {
-	c := newURLChecker(30*time.Second, "", newSemaphore(1024))
+	c := newURLChecker(30*time.Second, "", nil, newSemaphore(1024))
 
 	for _, u := range []string{"https://google.com", "README.md"} {
 		assert.Equal(t, nil, c.Check(u, "README.md"))
@@ -32,7 +45,7 @@ func TestURLCheckerCheckWithTimeout(t *testing.T) {
 }
 
 func TestURLCheckerCheckMany(t *testing.T) {
-	c := newURLChecker(0, "", newSemaphore(1024))
+	c := newURLChecker(0, "", nil, newSemaphore(1024))
 
 	for _, us := range [][]string{{}, {"https://google.com", "README.md"}} {
 		rc := make(chan urlResult, 1024)
@@ -45,7 +58,7 @@ func TestURLCheckerCheckMany(t *testing.T) {
 	}
 }
 
 func TestURLCheckerResolveURL(t *testing.T) {
-	f := newURLChecker(0, "", newSemaphore(1024))
+	f := newURLChecker(0, "", nil, newSemaphore(1024))
 
 	for _, c := range []struct {
 		source, target string
@@ -63,7 +76,7 @@ func TestURLCheckerResolveURL(t *testing.T) {
 }
 
 func TestURLCheckerResolveURLWithAbsolutePath(t *testing.T) {
-	f := newURLChecker(0, "", newSemaphore(1024))
+	f := newURLChecker(0, "", nil, newSemaphore(1024))
 
 	u, _, err := f.resolveURL("/foo", "foo.md")
 
@@ -72,7 +85,7 @@ func TestURLCheckerResolveURLWithAbsolutePath(t *testing.T) {
 }
 
 func TestURLCheckerResolveURLWithDocumentRoot(t *testing.T) {
-	f := newURLChecker(0, "foo", newSemaphore(1024))
+	f := newURLChecker(0, "foo", nil, newSemaphore(1024))
 
 	for _, c := range []struct {
 		source, target string
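
For illustration only (this invocation is not part of the patch; the regex and file names are made up for the example), the new flag would be used like this:

    liche -x 'https?://localhost(:[0-9]+)?' README.md docs/development.md

Any link whose fully resolved path or URL matches the regex is skipped rather than checked, which covers the development-server and private-repo links described in the commit message.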