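# Recursively crawl a site, collect email addresses from its pages, and write
# them to emails_found.txt.
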
import re

import requests
from bs4 import BeautifulSoup


def find_emails(soup, base_url, emails_found):
    """
    Find and store emails found in the soup object, keyed by the page they came from.
    """
    email_pattern = re.compile(r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+")
    emails = email_pattern.findall(str(soup))
    for email in set(emails):  # use set to avoid duplicates within a page
        if email not in emails_found:
            emails_found[email] = base_url
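
# Illustrative example (address and URL are placeholders, not real data):
# given HTML containing "Contact: info@example.com" and base_url
# "https://example.com", find_emails() adds {"info@example.com": "https://example.com"}
# to emails_found.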


def normalize_url(url):
    """
    Normalize a URL by collapsing redundant double slashes after the scheme.
    """
    if "://" not in url:  # leave mailto:, fragment, and other scheme-less hrefs untouched
        return url
    scheme, rest = url.split("://", 1)  # split once on "://"
    return scheme + "://" + rest.replace("//", "/")
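
# Illustrative example (hypothetical URL):
#   normalize_url("https://example.com//about//team")
#   -> "https://example.com/about/team"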


def find_links(soup, base_url, visited_urls):
    """
    Find same-site links in the soup object that have not been visited yet and return them.
    """
    links = []
    for link in soup.find_all('a', href=True):
        href = link['href']
        if href.startswith('/'):  # resolve root-relative links against the base URL
            href = base_url + href
        href = normalize_url(href)
        if href.startswith(base_url) and href not in visited_urls:
            links.append(href)
    return links
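
# Illustrative example (hypothetical markup): with base_url "https://example.com",
# an anchor with href="/contact" is returned as "https://example.com/contact",
# while off-site links such as "https://other.org/page" are dropped.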


def scrape_site(url, visited_urls=None, emails_found=None, depth=0, max_depth=3):
    """
    Scrape the site recursively for emails and subpage links.
    """
    if visited_urls is None:  # avoid shared mutable default arguments
        visited_urls = {}
    if emails_found is None:
        emails_found = {}
    if url in visited_urls or depth > max_depth:
        return
    print(f"Scraping {url} at depth {depth}")
    visited_urls[url] = True
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
        }
        response = requests.get(url, headers=headers, allow_redirects=True, timeout=10)
        final_url = normalize_url(response.url)  # URL after any redirects
        if final_url != url and final_url in visited_urls:
            return
        visited_urls[final_url] = True
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, 'html.parser')
            find_emails(soup, url, emails_found)
            for link in find_links(soup, url, visited_urls):
                scrape_site(link, visited_urls, emails_found, depth + 1, max_depth)
    except requests.RequestException as e:
        print(f"Failed to make a request to {url}: {e}")


def main(start_url, max_depth=3):
    visited_urls = {}
    emails_found = {}
    scrape_site(start_url, visited_urls, emails_found, 0, max_depth)
    # Write the collected emails to a file, each with the page it was found on
    with open('emails_found.txt', 'w') as file:
        for email, page in emails_found.items():
            file.write(f"Found on: {page}\n{email}\n\n")


if __name__ == "__main__":
    start_url = "https://capfirm.com/"  # or: input("Enter the URL to scrape: ")
    max_depth = 3  # or: int(input("Enter the maximum depth to scrape: "))
    main(start_url, max_depth)
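
# Usage sketch (assumes requests and beautifulsoup4 are installed, e.g.
# `pip install requests beautifulsoup4`, and that this file is saved under a
# .py name such as scraper.py; the name is illustrative):
#   python scraper.py
# Results are written to emails_found.txt as "Found on: <page>" followed by the email.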