#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
post.py
~~~~~~~~~~~~~~~~~
Crawl Weibo post content and comments and save them to the database.
"""
import sys

import requests

from config import *
from database import *
from gadget import *
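# The star imports above are expected to provide the names used below:
#   config   -- listUrlFormat, postUrlFormat, commentUrlFormat (URL templates)
#               and headers (HTTP request headers)
#   database -- db_connector(), insert_data(), selectData(), closeConn()
#   gadget   -- getDate(), getTimestamp(), saveLastTimestamp(), sleepTimes()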
def getContent(page, latestTimestamp, conn):
    '''Get post content and save it in the database.'''
    # convert the timestamp to a readable date
    timeLocal = getDate(latestTimestamp)
    print("===== Crawling Weibo posts after [%s] =====" % timeLocal)
    print("===== Crawling Weibo posts from page %s =====" % page)
    url = listUrlFormat.format(page)
    resp = requests.get(url, headers=headers)
    resp.encoding = 'utf-8'
    data = resp.json()
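    # The list endpoint is assumed to return JSON shaped roughly like this
    # (inferred from the key accesses below):
    #   {'data': {'cards': [{'mblog': {'id': ..., 'created_at': ...,
    #       'text': ..., 'isLongText': ..., 'attitudes_count': ...,
    #       'comments_count': ..., 'retweeted_status': {...}}}, ...]}}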
    # counts posts on this page that were already crawled
    breakCount = 0
    # process all post items on this page
    for content in data['data']['cards']:
        if 'mblog' in content:
            addTime = getTimestamp(content['mblog']['created_at'])
            print('Post created at %s' % addTime)
            # to crawl posts between times A and B instead, use:
            # if addTime > 1528905600 and addTime < 1529421596:
            # only crawl posts newer than the latest stored timestamp
            if addTime > latestTimestamp:
                print('POST ID: %s crawling started\n' % content['mblog']['id'])
                kwPost = {}
                kwPost['add_time'] = addTime
                kwPost['post_id'] = content['mblog']['id']
                kwPost['attitudes_count'] = content['mblog']['attitudes_count']
                kwPost['comments_count'] = content['mblog']['comments_count']
                # use the inline text unless the post has a long text version
                if not content['mblog']['isLongText']:
                    kwPost['content'] = content['mblog']['text']
                else:
                    kwPost['content'] = getLongTextContent(content['mblog']['id'])
                    # if fetching the long text fails, fall back to the short one
                    if kwPost['content'] is False:
                        kwPost['content'] = content['mblog']['text']
                # if the post retweets another one, fetch that content the same way
                if 'retweeted_status' in content['mblog']:
                    retweet = content['mblog']['retweeted_status']
                    kwPost['retweet_id'] = retweet['id']
                    if not retweet['isLongText']:
                        kwPost['retweet_content'] = retweet['text']
                    else:
                        kwPost['retweet_content'] = getLongTextContent(retweet['id'])
                        # if fetching the long text fails, fall back to the short one
                        if kwPost['retweet_content'] is False:
                            kwPost['retweet_content'] = retweet['text']
                # insert the post into the database
                insert_data('wb_mzm_post', conn, **kwPost)
                print('POST ID: %s saved\n' % kwPost['post_id'])
                # fetch and store all comments of this post, one row at a time
                for row in getComment(kwPost['post_id'], 0):
                    if row:
                        kwComment = {
                            'post_id': kwPost['post_id'],
                            'comment_id': row[0],
                            'like_count': row[1],
                            'add_time': row[2],
                            'comment_user_id': row[3],
                            'comment_user_name': row[4],
                            'user_photo': row[5],
                            'user_profile': row[6],
                            'text': row[7],
                            'reply_text': row[8],
                        }
                        insert_data('wb_mzm_comment', conn, **kwComment)
                print('Comments of POST ID: %s saved\n' % kwPost['post_id'])
            else:
                print('Already crawled, skipping')
                breakCount = breakCount + 1
                # after 5 already-crawled posts, assume we have caught up and stop
                if breakCount == 5:
                    # close the database connection before exiting
                    closeConn(conn)
                    print('All new posts crawled, exiting...')
                    sys.exit()
    print('===== Page %s done =====\n' % page)
def getLongTextContent(id):
    '''Fetch the full text of a long post; return False on failure.'''
    url = postUrlFormat.format(id)
    resp = requests.get(url, headers=headers)
    resp.encoding = 'utf-8'
    # '打开微博客户端' ("open the Weibo client") marks an anti-crawler page
    if resp and '打开微博客户端' not in resp.text:
        data = resp.json()
        return data['data']['longTextContent']
    return False
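# The long-text endpoint is assumed to return JSON shaped like
# {'data': {'longTextContent': '<full post text>'}} (inferred from the
# access above); any other response falls through to the False branch,
# which callers treat as a signal to keep the truncated text.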
def getComment(id, page):
    '''Get all comments of a post, page by page; return a generator.'''
    url = commentUrlFormat.format(id=id, page=page)
    resp = requests.get(url, headers=headers)
    resp.encoding = 'utf-8'
    data = resp.json()
    # keep fetching while the endpoint answers normally and pages remain
    while data['ok'] == 1 and 'data' in data and page < data['data']['max']:
        for content in data['data']['data']:
            addTime = getTimestamp(content['created_at'])
            user = content['user']
            # a comment may carry a text, a reply text, or both;
            # a missing field is yielded as an empty string
            if 'text' in content or 'reply_text' in content:
                yield [content['id'], content['like_counts'], addTime,
                       user['id'], user['screen_name'],
                       user['profile_image_url'], user['profile_url'],
                       content.get('text', ''), content.get('reply_text', '')]
        # print('Crawling comment page %s\n' % page)
        page = page + 1
        url = commentUrlFormat.format(id=id, page=page)
        resp = requests.get(url, headers=headers)
        resp.encoding = 'utf-8'
        data = resp.json()
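# The comment endpoint is assumed to return pages shaped roughly like this
# (inferred from the key accesses above):
#   {'ok': 1, 'data': {'max': <total pages>, 'data': [{'id': ...,
#       'like_counts': ..., 'created_at': ..., 'text': ..., 'reply_text': ...,
#       'user': {'id': ..., 'screen_name': ..., 'profile_image_url': ...,
#                'profile_url': ...}}, ...]}}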
if __name__ == '__main__':
    conn = db_connector()
    # read the timestamp of the most recently stored post
    latestTimestamp = selectData(conn, 'wb_mzm_post', 3)
    if latestTimestamp is None:
        latestTimestamp = 0
    saveLastTimestamp(latestTimestamp, 'last_post_timestamp.txt')
    print('Last updated to: %s' % getDate(latestTimestamp))
    postPage = 1
    # the program exits once all new posts are crawled;
    # the exit point is inside getContent()
    while True:
        getContent(postPage, latestTimestamp, conn)
        postPage = postPage + 1
        sleepTimes(3)
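# Typical run (assuming config and database are set up):
#   $ python3 post.py
# crawls each page of the list feed in turn, pausing between pages via
# sleepTimes(3), until getContent() hits 5 already-crawled posts and exits.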