forked from needmorecowbell/Hamburglar
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtableScraper.py
55 lines (40 loc) · 1.26 KB
/
tableScraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
import requests
import lxml.html as lh
import pandas as pd
import sqlalchemy as db
import re
url='https://en.wikipedia.org/wiki/List_of_file_signatures'
#Create a handle, page, to handle the contents of the website
page = requests.get(url)
#Store the contents of the website under doc
doc = lh.fromstring(page.content)
#Parse data that are stored between <tr>..</tr> of HTML
tr_elements = doc.xpath('//tr')
#Create empty list
col = []
i = 0
for t in tr_elements[0]:
i += 1
name = t.text_content().strip()
print(i, name)
if name != "":
col.append((name, []))
for j in range(1, len(tr_elements)):
T = tr_elements[j]
i = 0
for t in T.iterchildren():
data = t.text_content().strip()
if i > 0:
try:
data = int(data)
except:
pass
col[i][1].append(data)
i += 1
Dict = {title:column for (title, column) in col}
df = pd.DataFrame(dict([(k, pd.Series(v)) for k, v in Dict.items()]))
engine = db.create_engine('mysql+pymysql://root:toorroot@localhost/fileSign')
conn = engine.connect()
df.to_sql("signatures", engine, if_exists='replace')
db_string = engine.execute("SELECT `Hex signature` FROM `signatures` WHERE signatures.index=0").fetchall()
print(db_string)