-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathscraper
executable file
·92 lines (84 loc) · 3.09 KB
/
scraper
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
require 'pp'
require 'lib/tumblr'
# Blog domains the scraper iterates over, in order. Frozen so the list cannot
# be mutated by accident while the script is running.
INITIAL_BLOG_QUEUE = [
  'lukegrecki.com',
  'gastrogirl.tumblr.com'
].freeze
# Pulls the numeric Tumblr post ID out of a reblog root URL.
#
# The post payload carries the root post's URL but not its ID, so the ID is
# parsed from the "/post/<digits>" path segment.
#
# root_url - String URL of the root post, or nil.
#
# Returns the ID as a String of digits, or nil when root_url is nil or has no
# "/post/<digits>" segment.
def root_tumblr_id_from(root_url)
  return nil if root_url.nil?

  # Rewrite "-deactivated<digits>.tumblr.com" to ".tumblr.com" before matching.
  # The dots are escaped so that only literal dots match.
  normalized_url = root_url.sub(%r{-deactivated\d+-?\.tumblr\.com}, '.tumblr.com')
  normalized_url[%r{/post/(\d+)($|/)}, 1]
end

# Records a single entry from a post's notes list.
#
# Only 'reblog' notes are stored: the reblogging blog is found or created and
# a Post row linking that blog to root_post_id is found or created.
#
# note         - Hash for one note; reads 'type', 'blog_name' and 'blog_url'.
# root_post_id - id of the Post row that is the root of this reblog chain.
#
# Raises RuntimeError for a note type that is not known to this script.
def record_note(note, root_post_id)
  case note['type']
  when 'reblog'
    note_blog_name = Blog.canonical_name(note['blog_name'])
    note_blog = Blog.find_or_create_by_name(note_blog_name, :url => note['blog_url'])
    note_blog.save!
    note_post = Post.find_or_create_by_blog_id_and_reblogged_root_post_id(
      note_blog.id,
      root_post_id)
    note_post.save!
  when 'like', 'posted', 'note_model', nil
    # Currently unhandled; a missing type is ignored as well.
  else
    puts "Unhandled note type #{note['type']}"
    pp note
    raise "Unhandled note type #{note['type'].inspect}"
  end
end

# Walk every blog in the queue, paging through its posts until a page comes
# back empty, and store each post, its reblog root and its reblog notes.
INITIAL_BLOG_QUEUE.each do |blog_domain|
  puts "Starting on #{blog_domain}"
  offset = 0
  raw_blog = Tumblr.get_blog(blog_domain)
  blog_name = Blog.canonical_name(raw_blog['name'])
  blog = Blog.find_or_create_by_name(blog_name, :url => raw_blog['url'])

  loop do
    raw_posts = Tumblr.get_posts(blog_domain, :offset => offset)
    # Treat a nil page the same as an empty one: there is nothing left to read.
    break if raw_posts.nil? || raw_posts.empty?

    # NOTE(review): this materialises every root post just to count them, once
    # per page; a COUNT query would be cheaper if the ORM in use offers one.
    uniques = Post.find_all_by_reblogged_root_post_id(0).count
    puts "(#{raw_posts.length} entries from #{blog_domain}, #{Post.count} posts scraped with #{uniques} uniques)"

    raw_posts.each do |raw|
      begin
        tumblr_id = raw['id']
        root_name = raw['reblogged_root_name']

        if root_name == ''
          # This must be reblogged from a private blog.
          puts "skipping private root (#{blog_name}, #{tumblr_id}) with #{raw['note_count']} notes"
          next
        elsif root_name
          root_tumblr_id = root_tumblr_id_from(raw['reblogged_root_url'])
          if root_tumblr_id.nil?
            # Without an ID the lookup below would key on a nil tumblr_id and
            # could merge unrelated roots into one row, so skip the post.
            puts "skipping unparsable root url (#{blog_name}, #{tumblr_id}): #{raw['reblogged_root_url'].inspect}"
            next
          end

          root_blog = Blog.find_or_create_by_name(Blog.canonical_name(root_name))
          root_blog.save!
          root_post = Post.find_or_create_by_blog_id_and_tumblr_id(
            root_blog.id,
            root_tumblr_id,
            :url => raw['reblogged_root_url'],
            :reblogged_root_post_id => 0)
          root_post.save!
          root_post_id = root_post.id
        else
          # A root post id of 0 indicates that this post is its own root.
          root_post_id = 0
        end

        post = Post.find_or_create_by_blog_id_and_tumblr_id(
          blog.id,
          tumblr_id,
          :timestamp => raw['timestamp'],
          :url => raw['post_url'],
          :reblogged_root_post_id => root_post_id)
        post.save!

        # Notes on an original post hang off the post itself.
        root_post_id = post.id if root_post_id == 0
        (raw['notes'] || []).each { |note| record_note(note, root_post_id) }
      rescue
        # Dump the offending payload before letting the error propagate.
        pp raw
        raise
      end
    end

    offset += raw_posts.length
  end

  puts
end