forked from PuerkitoBio/gocrawl
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathexamples_test.go
49 lines (38 loc) · 1.38 KB
/
examples_test.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
package gocrawl
import (
"github.com/PuerkitoBio/goquery"
"net/http"
"regexp"
"time"
)
// rxOk matches the only URLs the example is willing to enqueue: the
// duckduckgo.com root and any path beginning with "/a".
//
// The pattern is anchored at both ends so that a foreign URL which merely
// ends with the duckduckgo address (for instance inside a query string)
// is rejected.
//
// NOTE(review): the example seeds the crawl with an https URL while this
// pattern only accepts http; presumably URL normalization rewrites the
// scheme before Filter sees it - confirm.
var rxOk = regexp.MustCompile(`^http://duckduckgo\.com(/a.*)?$`)
// ExampleExtender is the Extender implementation used by the example.
// It embeds the gocrawl-provided DefaultExtender so that only the methods
// the example cares about have to be written here.
type ExampleExtender struct {
	// Supplies the default implementation of every method except
	// Visit() and Filter(), which are overridden on ExampleExtender.
	DefaultExtender
}
// Visit overrides the DefaultExtender implementation for the needs of the
// example. It is the place to work with the fetched page, either through
// the goquery document (doc) or through res.Body.
//
// It always returns nil and true, which lets gocrawl find the links itself.
func (e *ExampleExtender) Visit(ctx *URLContext, res *http.Response, doc *goquery.Document) (interface{}, bool) {
	// Use the goquery document or res.Body to manipulate the data
	// ...

	// Return nil and true - let gocrawl find the links
	return nil, true
}
// Filter overrides the DefaultExtender implementation for the needs of the
// example. It reports true only when the URL has not been visited yet and
// its normalized form matches rxOk.
func (e *ExampleExtender) Filter(ctx *URLContext, isVisited bool) bool {
	// Already-visited URLs are rejected before the pattern is consulted.
	if isVisited {
		return false
	}
	return rxOk.MatchString(ctx.NormalizedURL().String())
}
// ExampleCrawl configures a crawler around ExampleExtender and runs it
// against the root of duckduckgo.
func ExampleCrawl() {
	// Build the options around our extender, then customize them.
	options := NewOptions(&ExampleExtender{})
	options.LogFlags = LogAll
	options.CrawlDelay = time.Second
	// Play nice with ddgo when running the test!
	options.MaxVisits = 2

	// Create the crawler and start at the root of duckduckgo.
	crawler := NewCrawlerWithOptions(options)
	crawler.Run("https://duckduckgo.com/")

	// Remove "x" before Output: to activate the example (will run on go test)

	// xOutput: voluntarily fail to see log output
}