-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcrawler.rb
44 lines (37 loc) · 889 Bytes
/
crawler.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
require 'rubygems'
require 'spidr'
require 'robots'
require 'pry'
# FROM: http://ruby.bastardsbook.com/chapters/web-crawling/#h-2-2
# 1.) Crawl link to link
# 2.) Crawl by using search
# Crawls an entire site starting from a root URL, printing and collecting
# every URL it encounters. Honours robots.txt and skips common asset files.
class Crawler
  # File extensions treated as static assets rather than crawlable pages.
  # Frozen so the constant cannot be mutated at runtime.
  IGNORED_EXTENSIONS = %w[js css pdf png ico doc docx ppt].freeze

  # root_url - String URL where the crawl starts.
  # Announces the crawl target on stdout as a side effect.
  def initialize(root_url)
    @root_url = root_url
    @urls = []
    puts "Crawling #{root_url}"
  end

  # Runs the crawl: Spidr visits every page reachable under @root_url
  # (robots: true makes it respect robots.txt; ignore_exts skips assets).
  # Each discovered URL is printed as it is found and accumulated in @urls,
  # then the full list is printed once the crawl finishes.
  def call
    Spidr.site(@root_url,
               robots: true,
               ignore_exts: IGNORED_EXTENSIONS) do |spider|
      spider.every_url do |url|
        puts url
        @urls << url
      end
    end
    puts
    puts "URLs:"
    puts @urls
  end
end
# UK council websites used as crawl targets. Frozen so the list cannot be
# mutated at runtime.
COUNCIL_PAGES = [
  "https://www.islington.gov.uk",
  "http://www.camden.gov.uk",
  "https://www.lambeth.gov.uk",
  "https://www.westminster.gov.uk",
  "http://www.basildon.gov.uk/",
].freeze

# Kick off a crawl against one randomly chosen council site.
Crawler.new(COUNCIL_PAGES.sample).call