Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix : retry redirect to AlreadyVisitedUrl will loop error #826

Open
wants to merge 7 commits into
base: master
Choose a base branch
from
12 changes: 9 additions & 3 deletions colly.go
Original file line number Diff line number Diff line change
Expand Up @@ -202,7 +202,10 @@ var collectorCounter uint32
type key int

// ProxyURLKey is the context key for the request proxy address.
const ProxyURLKey key = iota
const (
ProxyURLKey key = iota
// CheckRevisitKey is the context key under which scrape stores the
// request's checkRevisit flag. The redirect check reads it: a redirect to
// an already visited URL yields an AlreadyVisitedError only when the value
// is absent or true.
CheckRevisitKey
)

var (
// ErrForbiddenDomain is the error thrown if visiting
Expand Down Expand Up @@ -650,7 +653,8 @@ func (c *Collector) scrape(u, method string, depth int, requestData io.Reader, c
}
// note: once 1.13 is minimum supported Go version,
// replace this with http.NewRequestWithContext
req = req.WithContext(c.Context)
req = req.WithContext(context.WithValue(c.Context, CheckRevisitKey, checkRevisit))

if err := c.requestCheck(parsedURL, method, req.GetBody, depth, checkRevisit); err != nil {
return err
}
Expand Down Expand Up @@ -1382,7 +1386,9 @@ func (c *Collector) checkRedirectFunc() func(req *http.Request, via []*http.Requ
return err
}
if visited {
return &AlreadyVisitedError{req.URL}
if checkRevisit, ok := req.Context().Value(CheckRevisitKey).(bool); !ok || checkRevisit {
return &AlreadyVisitedError{req.URL}
}
}
err = c.store.Visited(uHash)
if err != nil {
Expand Down
22 changes: 22 additions & 0 deletions colly_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1814,3 +1814,25 @@ func TestCollectorPostRetryUnseekable(t *testing.T) {
t.Error("OnResponse Retry was called but BodyUnseekable")
}
}

// TestRedirectErrorRetry visits "/redirected/" and then "/redirect" on the
// test server. The first error seen for a request context triggers a single
// Retry; if a later error on that same context is an AlreadyVisitedError the
// test fails, because the retry should not report the already-visited
// redirect target again.
//
// NOTE(review): presumably the test server redirects "/redirect" to
// "/redirected/" — confirm against newTestServer.
func TestRedirectErrorRetry(t *testing.T) {
	srv := newTestServer()
	defer srv.Close()

	collector := NewCollector()
	collector.OnError(func(resp *Response, err error) {
		var visitedErr *AlreadyVisitedError
		switch {
		case resp.Ctx.Get("notFirst") == "":
			// First failure for this context: mark it and retry once.
			resp.Ctx.Put("notFirst", "first")
			_ = resp.Request.Retry()
		case errors.As(err, &visitedErr):
			// The retry itself failed with the already-visited error.
			t.Error("loop AlreadyVisitedError")
		}
	})
	// A no-op response callback is registered, as in the original test.
	collector.OnResponse(func(_ *Response) {})

	collector.Visit(srv.URL + "/redirected/")
	collector.Visit(srv.URL + "/redirect")
}
Loading