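# Recursively crawl a site, collect email addresses from its pages, and write
# them to emails_found.txt.
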
import re

import requests
from bs4 import BeautifulSoup


def find_emails(soup, base_url, emails_found):
    """
    Find and store emails found in the soup object, keyed by the page they came from.
    """
    email_pattern = re.compile(r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+")
    emails = email_pattern.findall(str(soup))
    for email in set(emails):  # use set to avoid duplicates within a page
        if email not in emails_found:
            emails_found[email] = base_url
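
# Illustrative example (address and URL are placeholders, not real data):
# given HTML containing "Contact: info@example.com" and base_url
# "https://example.com", find_emails() adds {"info@example.com": "https://example.com"}
# to emails_found.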


def normalize_url(url):
    """
    Normalize a URL by collapsing redundant double slashes after the scheme.
    """
    if "://" not in url:  # leave mailto:, fragment, and other scheme-less hrefs untouched
        return url
    scheme, rest = url.split("://", 1)  # split once on "://"
    return scheme + "://" + rest.replace("//", "/")
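
# Illustrative example (hypothetical URL):
#   normalize_url("https://example.com//about//team")
#   -> "https://example.com/about/team"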


def find_links(soup, base_url, visited_urls):
    """
    Find same-site links in the soup object that have not been visited yet and return them.
    """
    links = []
    for link in soup.find_all('a', href=True):
        href = link['href']
        if href.startswith('/'):  # resolve root-relative links against the base URL
            href = base_url + href
        href = normalize_url(href)
        if href.startswith(base_url) and href not in visited_urls:
            links.append(href)
    return links
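
# Illustrative example (hypothetical markup): with base_url "https://example.com",
# an anchor with href="/contact" is returned as "https://example.com/contact",
# while off-site links such as "https://other.org/page" are dropped.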


def scrape_site(url, visited_urls=None, emails_found=None, depth=0, max_depth=3):
    """
    Scrape the site recursively for emails and subpage links.
    """
    if visited_urls is None:  # avoid shared mutable default arguments
        visited_urls = {}
    if emails_found is None:
        emails_found = {}
    if url in visited_urls or depth > max_depth:
        return
    print(f"Scraping {url} at depth {depth}")
    visited_urls[url] = True
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
        }
        response = requests.get(url, headers=headers, allow_redirects=True, timeout=10)
        final_url = normalize_url(response.url)  # URL after any redirects
        if final_url != url and final_url in visited_urls:
            return
        visited_urls[final_url] = True
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, 'html.parser')
            find_emails(soup, url, emails_found)
            for link in find_links(soup, url, visited_urls):
                scrape_site(link, visited_urls, emails_found, depth + 1, max_depth)
    except requests.RequestException as e:
        print(f"Failed to make a request to {url}: {e}")


def main(start_url, max_depth=3):
    visited_urls = {}
    emails_found = {}
    scrape_site(start_url, visited_urls, emails_found, 0, max_depth)
    # Write the collected emails to a file, each with the page it was found on
    with open('emails_found.txt', 'w') as file:
        for email, page in emails_found.items():
            file.write(f"Found on: {page}\n{email}\n\n")


if __name__ == "__main__":
    start_url = "https://capfirm.com/"  # or: input("Enter the URL to scrape: ")
    max_depth = 3  # or: int(input("Enter the maximum depth to scrape: "))
    main(start_url, max_depth)
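
# Usage sketch (assumes requests and beautifulsoup4 are installed, e.g.
# `pip install requests beautifulsoup4`, and that this file is saved under a
# .py name such as scraper.py; the name is illustrative):
#   python scraper.py
# Results are written to emails_found.txt as "Found on: <page>" followed by the email.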