Skip to content
This repository was archived by the owner on Dec 13, 2022. It is now read-only.

Commit e2df7f3

Browse files
committed
Check host after redirect
Fixes #4
1 parent 8221ccf commit e2df7f3

2 files changed

Lines changed: 48 additions & 7 deletions

File tree

httpsyet/crawler.go

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -122,6 +122,10 @@ func toURLs(links []string, parse func(string) (*url.URL, error)) (urls []*url.U
122122
invalids = append(invalids, fmt.Sprintf("%s (%v)", s, e))
123123
continue
124124
}
125+
// Default to https
126+
if u.Scheme == "" {
127+
u.Scheme = "https"
128+
}
125129
// Ignore invalid protocols
126130
if u.Scheme == "http" || u.Scheme == "https" {
127131
urls = append(urls, u)
@@ -240,6 +244,11 @@ func crawlSite(s site, get func(string) (*http.Response, error)) ([]string, bool
240244
return nil, false, fmt.Errorf("%d %v", r.StatusCode, u)
241245
}
242246

247+
// Treat the site as external when the request was redirected to a different host
248+
if r.Request.URL.Host != u.Host {
249+
isExternal = true
250+
}
251+
243252
// Stop when site is external.
244253
// Also stop once depth one is reached; this check is ignored when depth is set to 0.
245254
if isExternal || s.Depth == 1 {

httpsyet/crawler_test.go

Lines changed: 39 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,10 @@ const (
5555
This is an relative internal link to a page without children.
5656
</a>
5757
58+
<a href="/redirect">
59+
This is an internal link but it redirects to an external page.
60+
</a>
61+
5862
<a href="{{ .Self }}/sub">
5963
This is an internal link.
6064
</a>
@@ -93,6 +97,12 @@ const (
9397

9498
basic = `
9599
<h1>Welcome to the basic page</h1>
100+
`
101+
102+
redirectTarget = `
103+
<a href="/no-follow">
104+
This link after redirecting should not be followed.
105+
</a>
96106
`
97107
)
98108

@@ -113,20 +123,31 @@ func TestRun(t *testing.T) {
113123
}
114124
}
115125

126+
redirect := func(name, u string) http.HandlerFunc {
127+
visited[name] = 0
128+
return func(w http.ResponseWriter, r *http.Request) {
129+
visited[name]++
130+
t.Logf("visited page: %s", name)
131+
http.Redirect(w, r, u, http.StatusMovedPermanently)
132+
}
133+
}
134+
135+
httpMux := http.NewServeMux()
136+
httpMux.HandleFunc("/page-a", serve("http/page-a", basic))
137+
httpMux.HandleFunc("/page-b", serve("http/page-b", basic))
138+
httpMux.HandleFunc("/redirect-target", serve("http/redirect-target", redirectTarget))
139+
httpServer := httptest.NewServer(httpMux)
140+
defer httpServer.Close()
141+
116142
pageMux := http.NewServeMux()
117143
pageMux.HandleFunc("/base", serve("page/base", basePage))
118144
pageMux.HandleFunc("/sub", serve("page/sub", subPage))
119145
pageMux.HandleFunc("/empty-sub", serve("page/empty-sub", basic))
120146
pageMux.HandleFunc("/sub/sub", serve("page/sub/sub", basic))
147+
pageMux.HandleFunc("/redirect", redirect("page/redirect", httpServer.URL+"/redirect-target"))
121148
pageServer := httptest.NewServer(pageMux)
122149
defer pageServer.Close()
123150

124-
httpMux := http.NewServeMux()
125-
httpMux.HandleFunc("/page-a", serve("http/page-a", basic))
126-
httpMux.HandleFunc("/page-b", serve("http/page-b", basic))
127-
httpServer := httptest.NewServer(httpMux)
128-
defer httpServer.Close()
129-
130151
tlsMux := http.NewServeMux()
131152
tlsMux.HandleFunc("/base", serve("tls/base", basic))
132153
tlsMux.HandleFunc("/base2", serve("tls/base2", basic))
@@ -244,7 +265,13 @@ func TestRunSingle(t *testing.T) {
244265
noErr(t, err)
245266

246267
expect := fmt.Sprintf(
247-
"404 %s/404 on page %s/base\n404 %s/404 on page %s/sub\n404 %s/404 on page %s/sub\n",
268+
`404 %s/404 on page %s/base
269+
404 %s/redirect on page %s/base
270+
404 %s/404 on page %s/sub
271+
404 %s/404 on page %s/sub
272+
`,
273+
pageServer.URL,
274+
pageServer.URL,
248275
pageServer.URL,
249276
pageServer.URL,
250277
httpServer.URL,
@@ -343,6 +370,8 @@ verbose: GET %s/page-a
343370
verbose: GET %s/404
344371
404 %s/404 on page %s/base
345372
verbose: GET %s/empty-sub
373+
verbose: GET %s/redirect
374+
404 %s/redirect on page %s/base
346375
verbose: GET %s/sub
347376
`,
348377
pageServer.URL,
@@ -358,6 +387,9 @@ verbose: GET %s/sub
358387
pageServer.URL,
359388
pageServer.URL,
360389
pageServer.URL,
390+
pageServer.URL,
391+
pageServer.URL,
392+
pageServer.URL,
361393
)
362394
eqLines(t, expect, errs.String(), "unexpected errors")
363395

0 commit comments

Comments
 (0)