scrapePdfUrl.py
# -*- coding: utf-8 -*-
# importing modules
import requests
from bs4 import BeautifulSoup
import datetime
import csv

# variables
today = datetime.datetime.now().strftime("%Y%m%d")
searchParam = 'pablo de camargo cerdeira'
urlParam = 'http://doweb.rio.rj.gov.br/buscaweb/' \
           'search?q={0}&f=100&px=10&t=1&p={1}'
htmParam = 'http://doweb.rio.rj.gov.br/do/navegadorhtml/' \
           'load_tree.php?edi_id={0}'


def scrapeSearchResults(searchArg):
    '''
    Returns the URLs containing the search parameter and writes them,
    together with their edition/page parameters, to a CSV file.
    '''
    # paginate through the search results until a page comes back empty
    allResults = []
    searchPage = 1
    while True:
        response = requests.get(urlParam.format(searchArg, searchPage))
        soup = BeautifulSoup(response.content, 'html.parser')
        # keep only anchors whose href carries the search parameter
        pageResults = [a['href'] for a in soup.find_all('a', href=True)
                       if a['href'].find('search=' + searchArg) > 0]
        if len(pageResults) > 0:
            allResults += pageResults
            searchPage += 1
        else:
            break
    # build a list of dicts with the search URLs and their parameters
    resultDicts = []
    for urlId, url in enumerate(allResults, start=1):
        resultDict = {}
        resultDict['id'] = urlId
        # resultDict['search_url'] = url
        # slice the edition id and page number out of the result URL
        resultDict['do_edicao'] = url[url.find('edi_id=') +
                                      len('edi_id='):url.find('&page=')]
        resultDict['do_pagina'] = url[url.find('&page=') +
                                      len('&page='):url.find('&search=')]
        resultDict['pdf_url'] = 'http://doweb.rio.rj.gov.br/' \
                                'ler_pdf.php?edi_id={0}&page={1}'.format(
                                    resultDict['do_edicao'],
                                    resultDict['do_pagina'])
        resultDicts.append(resultDict)
    # write one row per result; newline='' avoids blank lines on Windows
    with open(searchParam + '.csv', 'w', newline='') as csvfile:
        fieldnames = ['id', 'do_edicao', 'do_pagina', 'pdf_url']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for result in resultDicts:
            writer.writerow(result)
    # exploratory: fetch the HTML navigator tree for the first edition
    # found; the anchors collected here are not used yet
    ediParam = resultDicts[0]['do_edicao']
    response = requests.get(htmParam.format(ediParam))
    soup = BeautifulSoup(response.content, 'html.parser')
    pageResults = [a['href'] for a in soup.find_all('a', href=True)
                   if a['href'].find('search=' + searchArg) > 0]
    return True
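

# A minimal sketch (an illustrative addition, not part of the original
# script) of how the CSV written above could be consumed to download the
# PDFs themselves; the helper name 'downloadPdfs' is hypothetical.
def downloadPdfs(csvPath):
    '''Downloads each pdf_url listed in the CSV to the working directory.'''
    with open(csvPath, newline='') as csvfile:
        for row in csv.DictReader(csvfile):
            response = requests.get(row['pdf_url'])
            filename = 'edicao{0}_pagina{1}.pdf'.format(
                row['do_edicao'], row['do_pagina'])
            with open(filename, 'wb') as pdf:
                pdf.write(response.content)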


if __name__ == '__main__':
    scrapeSearchResults(searchParam)