-
Notifications
You must be signed in to change notification settings - Fork 0
/
crawler.py
46 lines (39 loc) · 1.25 KB
/
crawler.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
from bs4 import BeautifulSoup
import requests
import sys
import urllib2
import re
import ConfigParser
import os
# Index page listing the health ("saude") articles to be crawled.
url = 'http://www.bahianoticias.com.br/saude/artigos.html'
def buscar(url):
lista = []
urlocal = 'http://www.bahianoticias.com.br'
contador = ConfigParser.ConfigParser()
contador.read(['contador.ini'])
i = contador.getint('DEFAULT','contador')
path = os.path.dirname(__file__)
print(path)
r = requests.get(url)
soup = BeautifulSoup(r.text)
for a in soup.findAll('a', {'class': 'btn-default'}):
link = a['href']
html = urllib2.urlopen(urlocal + link).read()
i+=1
soup = BeautifulSoup(html)
for p in soup.findAll('div', {'class':'text-descricao'}):
regex = re.compile(r'<[^<]*?>')
descricao = regex.sub('', str(p))
arquivo = open('arquivos/artigo'+str(i)+'.txt','w')
arquivo.write(descricao)
arquivo.close
print '============================'
print '\n' + descricao
contador.set('DEFAULT','contador', i)
with open('contador.ini','w') as config:
contador.write(config)
print "\n>> Total = %d \n " % i
sys.exit()
# Script entry point: run the crawl only when the "-l" flag was passed.
for argumento in sys.argv:
    if argumento == "-l":
        buscar(url)