crabgrassbk.py
#!/usr/local/bin/python
# -*- coding: utf-8 -*-
import os
import shutil
import time
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from lib.crawler import Crawler
from lib.config import Config
from lib.zip import Zip
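# The lib.* modules are project-local. Their interfaces, as inferred from the
# calls below (an assumption based on usage, not a documented API):
#   Config('conf/config.yml') -> exposes phantomjs_bin_path, dir_backup,
#       demo_mode and max_iterations_in_demo_mode
#   Crawler(driver, config)   -> login(), goto_pages(),
#       get_all_created_pages_links(), crawl_link(link, dest_dir), logout()
#   Zip                       -> ZIP_EXTENSION constant and zipdir(src_dir, zip_path)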
def init_driver(config_instance: Config):
    # PhantomJS driver with a short explicit-wait helper attached
    drv = webdriver.PhantomJS(executable_path=config_instance.phantomjs_bin_path)
    drv.wait = WebDriverWait(drv, 1)
    return drv
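
# Note: PhantomJS support was deprecated in Selenium 3.8 and removed in
# Selenium 4. A minimal alternative sketch using headless Chrome (not part
# of the original script; assumes a local chromedriver on PATH):
#
#     from selenium.webdriver.chrome.options import Options
#     def init_headless_driver():
#         opts = Options()
#         opts.add_argument("--headless")
#         drv = webdriver.Chrome(options=opts)
#         drv.wait = WebDriverWait(drv, 1)
#         return drv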

if __name__ == "__main__":
    # init config object and driver
    config = Config('conf/config.yml')
    driver = init_driver(config)
    crawler = Crawler(driver, config)

    print("====Driver logging in====")
    crawler.login()
    print("====Driver logged in====")

    print("====Starting backup process====")
    # head the driver to the pages section and give it a moment to render
    crawler.goto_pages()
    time.sleep(2)
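    # A fixed sleep is brittle; an explicit wait would be more robust. A
    # sketch, assuming the pages listing renders elements with the
    # (hypothetical) CSS class "page-link":
    #
    #     from selenium.webdriver.common.by import By
    #     from selenium.webdriver.support import expected_conditions as EC
    #     driver.wait.until(
    #         EC.presence_of_element_located((By.CLASS_NAME, "page-link")))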
    # create the folder for this backup run if it doesn't exist yet
    current_datetime = time.strftime('%d%m%Y%H%M%S')
    backup_dir_for_now = os.path.join(config.dir_backup, current_datetime)
    if not os.path.exists(config.dir_backup):
        os.makedirs(config.dir_backup)
    if not os.path.exists(backup_dir_for_now):
        os.makedirs(backup_dir_for_now)
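    # Note: on Python 3 the two existence checks collapse into a single
    # os.makedirs(backup_dir_for_now, exist_ok=True), which also creates
    # config.dir_backup as an intermediate directory.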
    # navigate through the paginated results and collect all page links to fetch
    links_to_crawl = crawler.get_all_created_pages_links()
    total_links = len(links_to_crawl)

    # fetch the pages, honouring the demo-mode cap from the config
    max_iterations = config.max_iterations_in_demo_mode
    counter = 0
    for link in links_to_crawl:
        if config.demo_mode and counter == max_iterations:
            print("====DEMO MODE. Stopping at", max_iterations, "links====")
            break
        crawler.crawl_link(link, backup_dir_for_now)
        counter += 1
        print("Processed link", counter, "of", total_links)
print("====Backup process finished====")
time.sleep(1)
# logout driver
print("====Driver logging out====")
crawler.logout()
print("====Driver logged out====")
# close driver connection
driver.quit()
    # dump the backup into a zip file
    zip_path = backup_dir_for_now + os.extsep + Zip.ZIP_EXTENSION
    print("====Packing backup in", zip_path, "====")
    Zip.zipdir(backup_dir_for_now, zip_path)
    print("====Packed backup====")
    # delete the backup source folder, leaving only the zip file in the backup root
    print("====Deleting backup source folder", backup_dir_for_now, "====")
    shutil.rmtree(backup_dir_for_now)
    print("====Deleted backup source folder", backup_dir_for_now, "====")