-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathextract.py
35 lines (35 loc) · 1.1 KB
/
extract.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
import urllib, mechanize, re, os
# Single mechanize browser shared by the whole script; getcomic() uses it to
# open comic pages and to follow their "Previous" links.
br = mechanize.Browser()

# Log that is scanned further down for the last comic URL it mentions, so an
# interrupted crawl can resume (presumably the captured output of an earlier
# run -- confirm). It is opened before the chdir below, so the name resolves
# relative to the directory the script was launched from.
# On a first run there is no log yet: fall back to an empty readable stream
# (os.devnull) instead of crashing, so the "no previous link" path is taken.
if os.path.isfile('logfile.txt'):
    logfile = open('logfile.txt', 'rb')
else:
    logfile = open(os.devnull, 'rb')

# All downloads are written relative to this directory.
GLOBALDIR = 'C&HComics'
if not os.path.exists(GLOBALDIR):
    os.makedirs(GLOBALDIR)
os.chdir(GLOBALDIR)
# Matches the direct URL of a comic image inside a comic page's HTML.
_COMIC_IMAGE_RE = re.compile(
    r'http://www.explosm.net/db/files/Comics/[-\w/]+\.(?:jpg|gif|png)')


def _save_comic_image(image_url, comic_id):
    """Download ``image_url`` into the current directory unless already saved.

    The file is named ``<comic_id>_<image file name>``. When the image does
    not sit directly under ``Comics/`` on the server, it is stored in a
    sub-directory named after the image's parent folder (created on demand).
    """
    parts = image_url.split('/')
    folder = parts[-2]
    filename = comic_id + '_' + parts[-1]
    if folder != 'Comics':
        if not os.path.exists(folder):
            os.makedirs(folder)
        pathname = folder + '/' + filename
    else:
        pathname = filename
    if os.path.exists(pathname):
        # Already downloaded (e.g. by an earlier, interrupted run).
        return
    # Fetch first, then open the target, so a failed download leaves no
    # empty file behind that would later be mistaken for a finished one.
    data = urllib.urlopen(image_url).read()
    with open(pathname, 'wb') as outputfile:
        outputfile.write(data)


def getcomic(path):
    """Download the comic at ``path`` and every earlier comic reachable from it.

    For each page: fetch it with the shared browser ``br``, look for a comic
    image URL in the HTML, save the image if one is found and it is not
    already on disk, then follow the page's "Previous" link and repeat.

    The crawl stops when the second-to-last '/'-separated component of the
    current URL is '15' (presumably the id of the oldest comic -- confirm),
    or when the page has no "Previous" link.

    Args:
        path: URL of the comic page to start from. Its second-to-last
            '/'-separated component is used as the comic id in file names.

    Returns:
        None.
    """
    # Iterate instead of recursing once per comic: a long crawl would
    # otherwise exceed the interpreter's recursion limit.
    while path.split('/')[-2] != '15':
        print("Reading link " + path + " ...")
        source = br.open(path).read()
        match = _COMIC_IMAGE_RE.search(source)
        if match:
            image_url = match.group(0)
            print("Found image path " + image_url)
            _save_comic_image(image_url, path.split('/')[-2])
        # Pages without a recognizable image are skipped, not fatal.
        try:
            rs = br.follow_link(text_regex=r"Previous")
        except mechanize.LinkNotFoundError:
            print("No 'Previous' link on " + path + "; stopping.")
            return
        path = rs.geturl()
        print("Found next link " + path)
# Resume support: look for comic page URLs in the log and restart from the
# last one mentioned; with none found, start from the comics index URL.
try:
    log_text = logfile.read()
finally:
    # The handle is not needed during the (long) crawl, so release it first.
    logfile.close()
lastlink = re.findall(r'http://explosm.net/comics/[0-9]+/', log_text)
if lastlink:
    getcomic(lastlink[-1])
else:
    getcomic("http://explosm.net/comics")