-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdatabase.py
136 lines (127 loc) · 3.65 KB
/
database.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
"""
* Responsibility:
Provide functions for crawler and parser to pertaining to any data persistance task.
- Why a database instead of a simpler csv file? (CSV is very popular with data science related projects)
Because I wanted to make the scraper handle errors gracefully.
I wanted the stacktrace to be stored somewhere when crawling or parsing fails so that I can review later.
I wanted the crawler to skip successfully crawled pages.
I also wanted to do housekeeping like track insertion datetime.
"""
import sqlite3
from time import gmtime, strftime
class Database(object):
def __init__(self):
self.conn = sqlite3.connect('fraiser.db')
self.create_tables()
def create_tables(self):
sqls = [
"""
CREATE TABLE IF NOT EXISTS history
(
url TEXT,
detail TEXT,
status TEXT,
timestamp TEXT
);
""",
"""
CREATE TABLE IF NOT EXISTS episode
(
url TEXT,
season INTEGER,
episode INTEGER,
title TEXT,
transcript_written_date TEXT,
transcript_revised_date TEXT,
aired_date TEXT,
writers TEXT,
directors TEXT,
timestamp TEXT
);
""",
"""
CREATE TABLE IF NOT EXISTS script
(
url TEXT,
cast TEXT,
dialog TEXT,
timestamp TEXT
);
"""
]
for sql in sqls:
self.conn.execute(sql)
self.conn.commit()
def save_history(self, url, detail, status):
sql = """
INSERT INTO history VALUES (
?,
?,
?,
?
)
"""
self.conn.execute(sql, (
url,
detail,
status,
strftime("%Y-%m-%d %H:%M:%S", gmtime())
))
self.conn.commit()
def check_history(self, url, status):
sql = """
SELECT * FROM history
WHERE
url = ?
and status = ?
"""
cur = self.conn.cursor()
cur.execute(sql, (
url,
status
))
return cur.fetchall()
def save_episode_info(self, url, info):
sql = """
INSERT INTO episode VALUES (
?,
?,
?,
?,
?,
?,
?,
?,
?,
?
)
"""
self.conn.execute(sql, (
url,
info['season'],
info['episode'],
info['title'],
info['written_date'],
info['revised_date'],
info['aired_date'],
info['writers'],
info['director'],
strftime("%Y-%m-%d %H:%M:%S", gmtime())
))
self.conn.commit()
def save_script(self, url, cast, dialog):
sql = """
INSERT INTO script VALUES (
?,
?,
?,
?
)
"""
self.conn.execute(sql, (
url,
cast,
dialog,
strftime("%Y-%m-%d %H:%M:%S", gmtime())
))
self.conn.commit()