-
Notifications
You must be signed in to change notification settings - Fork 2
/
s1.py
62 lines (55 loc) · 2.11 KB
/
s1.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import re
from utils import parser
from utils import upload_img
from utils import html2markdown
from utils import generator
import sys
def lz_only(url):
    """Return the author-only ("view poster only") URL for the thread at *url*."""
    author_infos = parser(url).find_all(class_='authi')
    relative_href = author_infos[1].find('a')['href']
    return 'https://bbs.saraba1st.com/2b/' + relative_href
def get_posts(url):
    """Fetch the thread page at *url* and return its posts as one Markdown string.

    For each post: strip the "last edited" status and "attachment" tip notices,
    normalise <div> containers to <p>, undo the lazy-loading ``file=`` attribute,
    re-host every image via ``upload_img``, and convert the HTML with
    ``html2markdown``.  Only the first page is crawled.
    """
    def get_post(page_url):
        # Each post body lives in a 't_fsz' container on the page.
        return parser(page_url).find_all(class_='t_fsz')

    # NOTE: multi-page crawling was previously disabled; only *url* is fetched.
    # Raw string so the regex escapes survive; compiled once, reused per post.
    img_src = re.compile(r"""\bsrc\b\s*=\s*[\'\"]?([^\'\"]*)[\'\"]?""")
    parts = []
    for post in get_post(url):
        try:
            post.find(class_='pstatus').clear()  # remove "last edited ..." notice
            post.find(class_='tip').clear()      # remove "attachment: ..." notice
        except AttributeError:
            pass  # notice absent on this post — nothing to strip
        html = str(post)
        # Normalise block containers to paragraphs for the Markdown converter.
        # Raw strings: '<\/div...>' was an invalid escape sequence
        # (SyntaxWarning, slated to become an error in CPython).
        html = re.sub(r'<div[^>]*>', '<p>', html)
        html = re.sub(r'</div[^>]*>', '</p>', html)
        html = html.replace('file="', 'src="')  # lazy-loaded images use file=
        for img in img_src.findall(html):
            # Re-host the image and point every occurrence at the new location.
            html = html.replace(img, upload_img(img))
        parts.append(html2markdown(html))
    # join instead of repeated += (avoids quadratic string building).
    return ''.join(parts)
def get_meta(url):
    """Collect thread metadata as a dict: cleaned title, author name, and *url*."""
    page = parser(url)
    raw_title = page.find(id='thread_subject').text
    # Strip content wrapped in [] or 【】 brackets from the title.
    cleaned_title = re.sub(r'\[.*?\]|【.*?】', '', raw_title)
    author_name = page.find(class_='authi').find('a', class_='xw1').text
    return {
        'title': cleaned_title,
        'author': author_name,
        'original': url,
    }
def get_date(url):
    """Return the post date sliced from the second 'authi' block's <em> text."""
    stamp = parser(url).find_all(class_='authi')[1].find('em').text
    # Characters 4..12 of the timestamp text hold the date portion.
    return stamp[4:13]
def s1_spider(id):
    """Crawl Stage1 thread *id* (author-only view) and emit it via generator()."""
    thread_url = f'https://bbs.saraba1st.com/2b/thread-{id}-1-1.html'
    author_view = lz_only(thread_url)
    meta = get_meta(author_view)
    body = get_posts(author_view)
    date = get_date(author_view)
    generator('Stage1', meta, body, date)
if __name__ == '__main__':
    # argv entries are already str — the redundant str() wrapper was dropped.
    s1_spider(sys.argv[1])