-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathgetPapers.py
130 lines (110 loc) · 3.46 KB
/
getPapers.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
'''
@Descripttion: find papers whose title contains keywords
@version:
@Author: wangshuo
@Date: 2020-06-29 09:45:06
@LastEditors: wangshuo
@LastEditTime: 2020-07-28 10:58:48
'''
from bs4 import BeautifulSoup
from urllib.request import urlopen
import re
import requests
import os
# Supported venues: CVPR / ICCV / ECCV / WACV (venue name + year)
CONFERENCE = 'WACV2022'
# One or more keywords matched (case-insensitively) against paper titles
KEYWORDS = ['few-shot', 'few']
# Whether to download the matched PDFs
DOWNLOAD_FLAG = True
# URL prefix of the open-access site
PREFIX = 'https://openaccess.thecvf.com/'
URL_BASE = PREFIX + CONFERENCE.upper()

# Some conference editions split their listing across per-day pages;
# map those editions to the day slugs their URLs require.
_DAYS_BY_CONFERENCE = {
    'CVPR2020': ['2020-06-16', '2020-06-17', '2020-06-18'],
    'CVPR2019': ['2019-06-18', '2019-06-19', '2019-06-20'],
    'CVPR2021': ['all'],
    'ICCV2021': ['all'],
}
DAY = _DAYS_BY_CONFERENCE.get(CONFERENCE.upper(), [])

# Listing pages to scrape: one URL per day slug, or the base URL alone.
URL = [URL_BASE + '?day=' + day for day in DAY] if DAY else [URL_BASE]

# Directory where downloaded papers are stored
SAVEPATH = 'papers_' + KEYWORDS[0] + '_' + CONFERENCE
# Prefix prepended to every saved file name
FILEPREFIX = '[' + CONFERENCE + ']'
# Characters illegal in Windows file names, each replaced by a space
DICT_REP = {
    ':': ' ',
    '?': ' ',
    '<': ' ',
    '>': ' ',
    '|': ' ',
}
def main():
    '''
    Scrape the configured conference listing pages, collect papers whose
    title contains any of KEYWORDS, optionally download their PDFs into
    SAVEPATH, and write a markdown index of all matches.
    '''
    # Create the output folder (no-op if it already exists; avoids the
    # exists()/mkdir() race of the manual check).
    os.makedirs(SAVEPATH, exist_ok=True)
    titles_list = []   # matched paper titles
    links_list = []    # matched paper PDF URLs
    papers_num = 0     # running count of matches

    for url in URL:
        html = urlopen(url).read().decode('utf-8')
        soup = BeautifulSoup(html, features='lxml')
        # Every paper title on the listing page is a <dt class="ptitle">.
        paper_titles = soup.find_all('dt', {"class": "ptitle"})
        for title_tag in paper_titles:
            paper_title = title_tag.get_text()
            title_lower = paper_title.lower()
            # Keep the paper if any keyword occurs in its title; counting
            # it once regardless of how many keywords match.
            if not any(kw in title_lower for kw in KEYWORDS):
                continue
            papers_num += 1
            titles_list.append(paper_title)
            # File name with Windows-illegal characters replaced.
            file_name = FILEPREFIX + rep(paper_title) + '.pdf'
            file_path = os.path.join(SAVEPATH, file_name)
            # The pdf anchor lives a few siblings after the title tag
            # (whitespace text nodes sit between the <dt>/<dd> elements) —
            # fragile if the site's markup changes; TODO confirm per venue.
            pdf_url = PREFIX + title_tag.next_sibling.next_sibling.next_sibling.next_sibling.a['href']
            links_list.append(pdf_url)
            if DOWNLOAD_FLAG:
                # Timeout prevents a single stalled request hanging the run.
                r = requests.get(pdf_url, timeout=60)
                print(papers_num, paper_title)
                if r.ok:
                    with open(file_path, 'wb') as f:
                        f.write(r.content)
                else:
                    # Don't save an HTTP error page under a .pdf name.
                    print('  download failed (HTTP %d): %s' % (r.status_code, pdf_url))
    save_markdown(titles_list, links_list, nums=papers_num)
def rep(rawstr, dict_rep=None):
    '''
    Replace substrings in *rawstr* according to *dict_rep*.

    Used to strip characters that are illegal in Windows file names
    before building a save path.

    :param rawstr: string to sanitise (typically a paper title)
    :param dict_rep: mapping of substring -> replacement; when None, the
        module-level DICT_REP is used. Looked up at call time rather than
        bound as a default at definition time, so the constant can be
        patched (or defined later) without re-defining this function.
    :return: the sanitised string
    '''
    if dict_rep is None:
        dict_rep = DICT_REP
    for old, new in dict_rep.items():
        rawstr = rawstr.replace(old, new)
    return rawstr
def save_markdown(titles, links, nums):
    '''
    Write a numbered markdown list of matched papers to
    SAVEPATH/papers_list.md.

    :param titles: matched paper titles
    :param links: pdf URLs, parallel to *titles*
    :param nums: number of leading entries to write
    '''
    # Build every line first and join once at the end — avoids quadratic
    # '+=' string growth. zip over sliced lists also guards against an
    # IndexError when one list is shorter than *nums*.
    entries = [
        ' ' + str(i + 1) + '. [' + title + '](' + link + ')' + '\n'
        for i, (title, link) in enumerate(zip(titles[:nums], links[:nums]))
    ]
    file_path = os.path.join(SAVEPATH, 'papers_list.md')
    with open(file_path, 'w') as f:
        f.write(''.join(entries))
if __name__ == "__main__":
main()