-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathall_links.py
29 lines (26 loc) · 847 Bytes
/
all_links.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
# Procedure to download a web page's HTML content.
def get_page(url):
    """Return the content found at ``url`` as text, or ``""`` on failure.

    This is a best-effort fetch: any error raised while opening or reading
    the URL (malformed URL, network failure, HTTP error, ...) results in an
    empty string instead of an exception, so callers can simply treat an
    empty page as "nothing to scan".

    Args:
        url: Address of the resource to download.

    Returns:
        The response body as a ``str``. On Python 3 the raw bytes are
        decoded as UTF-8, with undecodable bytes replaced.
    """
    import contextlib

    # ``urlopen`` lives in ``urllib`` on Python 2 and in ``urllib.request``
    # on Python 3; resolve whichever one this interpreter provides.
    try:
        from urllib.request import urlopen
    except ImportError:
        from urllib import urlopen

    try:
        # ``closing`` releases the connection even if ``read`` fails, and it
        # also works with Python 2 responses, which are not context managers.
        with contextlib.closing(urlopen(url)) as response:
            body = response.read()
    except Exception:
        # Deliberately broad to keep the best-effort contract, but unlike a
        # bare ``except`` it lets KeyboardInterrupt/SystemExit propagate.
        return ""

    # Python 3 yields bytes; the link scanner searches with text patterns.
    if not isinstance(body, str):
        body = body.decode("utf-8", "replace")
    return body
# Procedure to find and extract the first href link in 'page', along with
# the position in the text from which the next search should resume.
def get_next_target(page):
    """Extract the first double-quoted ``<a href=`` target from ``page``.

    Args:
        page: Text to scan.

    Returns:
        A ``(url, end_quote)`` tuple, where ``url`` is the text between the
        first pair of double quotes following ``<a href=`` and ``end_quote``
        is the index of the closing quote. Returns ``(None, 0)`` when no
        anchor is found, or when the anchor is not followed by a complete
        pair of double quotes.
    """
    start_link = page.find('<a href=')
    if start_link == -1:
        return None, 0

    start_quote = page.find('"', start_link)
    if start_quote == -1:
        # Unquoted href and no quote anywhere later: nothing extractable.
        # Without this guard the slice below would start from index 0.
        return None, 0

    end_quote = page.find('"', start_quote + 1)
    if end_quote == -1:
        # Unterminated value; a -1 end index would silently drop the last
        # character and hand the caller a bogus resume position.
        return None, 0

    return page[start_quote + 1:end_quote], end_quote
# Procedure to iterate through all links in 'page' and print each one.
def print_all_links(page):
    """Print every link target that ``get_next_target`` finds in ``page``.

    The page is scanned front to back: after each hit the text is trimmed
    to start at the position reported by ``get_next_target`` and the search
    is repeated until no further link is reported.

    Args:
        page: Text to scan for links.
    """
    while True:
        url, endpos = get_next_target(page)
        # Stop only when no link was found. Testing plain truthiness would
        # also stop on an empty href and lose every link after it. The
        # position check guarantees the remaining text shrinks each pass.
        if url is None or endpos <= 0:
            break
        if url:
            # Single-argument call form prints the same on Python 2 and 3.
            print(url)
        page = page[endpos:]
# Script driver: download the hard-coded page and print every link found in it.
print_all_links(get_page('http://dannyroosevelt.com'))