-
Notifications
You must be signed in to change notification settings - Fork 0
/
CFH图片库.py
83 lines (69 loc) · 3.57 KB
/
CFH图片库.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
import json
import os
import requests
import re
import random
#作业从专业网站爬图片/从百度网站爬图片
#chris 4/21 仅供学习参考
class TryDownload:
    """Download photos of one species from the CFH site (www.cfh.ac.cn).

    The keyword is first sent to the species-search endpoint; the ``ID`` of
    the first match is then used to request that species' photo list, and
    each listed photo is saved as ``<n>.jpg`` inside the download directory.

    Args:
        download_file: Name of the sub-directory (below ``./download/cfh/``)
            that receives the images.
        key_word: Search keyword sent to the species-search endpoint.
        download_max: Maximum number of images to fetch. Any value accepted
            by ``int()`` works (e.g. ``100`` or ``100.0``).
    """

    _SITE_ROOT = 'http://www.cfh.ac.cn'

    # Seconds to wait for either JSON endpoint before giving up.
    _API_TIMEOUT = 30

    # Candidate header sets; one is picked at random for each API request.
    # This has to be an indexable sequence: dicts are unhashable, so they
    # cannot be stored in a set, and random.choice() needs indexing anyway.
    _HEADER_POOL = (
        {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:66.0) '
                          'Gecko/20100101 Firefox/66.0',
            'Host': 'www.cfh.ac.cn',
            'Cookie': 'Hm_lvt_17100a428da6da3b4e5da32712ca72c3=1555421469; '
                      'Hm_lpvt_17100a428da6da3b4e5da32712ca72c3=1555421566; '
                      'CFH_Cookie=sk3fvuvpywqj41uwnncntajr',
        },
        {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1'},
        {'User-Agent': 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0;'},
        {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50'},
        {'User-Agent': 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50'},
    )

    def __init__(self, download_file, key_word, download_max):
        # Number of photos processed so far (failed attempts are counted too,
        # because the counter doubles as the file-name index).
        self.download_num = 0
        # Coerced to int: it is used as a slice bound and as the page size.
        self.download_max = int(download_max)
        self.key_word = key_word
        self.download_path = './download/cfh/' + str(download_file)

    @classmethod
    def _random_headers(cls):
        """Return a copy of one randomly chosen header set."""
        return dict(random.choice(cls._HEADER_POOL))

    @classmethod
    def _image_url(cls, photo):
        """Build the full-size image URL from a photo entry's thumbnail path."""
        return cls._SITE_ROOT + str(photo['thumbnail']).replace('Thumbnail', 'Normal')

    def _find_species_id(self):
        """Return the ``ID`` of the first search hit, or ``None`` if no hits."""
        response = requests.get(
            self._SITE_ROOT + '/ajaxserver/speciesserv.ashx',
            # Passing params lets requests percent-encode the keyword.
            params={'action': 'spsearchzh', 'keyword': self.key_word},
            headers=self._random_headers(),
            timeout=self._API_TIMEOUT,
        )
        response.raise_for_status()
        matches = json.loads(response.text)
        if not matches:
            return None
        return matches[0]['ID']

    def _list_photos(self, species_id):
        """Return the ``photolist`` entries for *species_id* (possibly empty)."""
        response = requests.get(
            self._SITE_ROOT + '/AjaxServer/Server.ashx',
            params={
                'service': 'photoset',
                'method': 'get',
                'spid': species_id,
                'pagesize': self.download_max,
                'page': 1,
            },
            headers=self._random_headers(),
            timeout=self._API_TIMEOUT,
        )
        response.raise_for_status()
        return json.loads(response.text).get('photolist') or []

    def do_download(self):
        """Search for the keyword and save up to ``download_max`` photos.

        Images are written to ``self.download_path`` as ``1.jpg``, ``2.jpg``,
        and so on; the directory is created when missing. A photo that fails
        to download or to be written is reported and skipped, leaving a gap
        in the numbering. Nothing is downloaded when the search has no hits.

        Raises:
            requests.RequestException: If either JSON endpoint cannot be
                reached or answers with an HTTP error status.
        """
        species_id = self._find_species_id()
        if species_id is None:
            print('未找到物种: ' + str(self.key_word))
            return

        photos = self._list_photos(species_id)
        self.download_num = 0
        os.makedirs(self.download_path, exist_ok=True)

        # Slicing caps the loop at the number of photos actually returned,
        # which may be smaller than download_max.
        for photo in photos[:self.download_max]:
            img_url = self._image_url(photo)
            print('正在下载第 '+str(self.download_num+1)+' 张图片,图片地址为 '+str(img_url))
            try:
                # The timeout is drawn at random from 10, 20, ..., 100 seconds.
                pic = requests.get(img_url, timeout=10*random.randint(1, 10))
                # Avoid saving an HTTP error page under a .jpg name.
                pic.raise_for_status()
                pic_name = os.path.join(self.download_path, str(self.download_num+1)+'.jpg')
                with open(pic_name, 'wb') as f:
                    f.write(pic.content)
            except (requests.RequestException, OSError) as err:
                print(err)
            self.download_num += 1
        print("下载完成")
if __name__ == '__main__':
    import csv

    # Batch mode: read namelist.csv (resolved against the current working
    # directory) and run one download job per row. Each row is expected to
    # hold: target folder name, search keyword, maximum number of images.
    # newline='' is the mode the csv module asks for, and the with-block
    # guarantees the file handle is closed.
    with open('namelist.csv', 'r', newline='') as name_file:
        for row in csv.reader(name_file):
            print(row)
            if len(row) < 3:
                # Blank or incomplete line: nothing to download.
                continue
            # The count may be written as "100" or "100.0"; the downloader
            # needs a whole number.
            test = TryDownload(row[0], row[1], int(float(row[2])))
            test.do_download()

    # Example of a single direct run without the CSV file:
    #   test = TryDownload("十大功劳", "十大功劳", 100)
    #   test.do_download()