-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathExtract_combine.py
102 lines (80 loc) · 2.96 KB
/
Extract_combine.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
from Plot_AQI import avg_data_2013,avg_data_2014,avg_data_2015,avg_data_2016
import requests
import sys
import pandas as pd
from bs4 import BeautifulSoup
import os
import csv
def met_data(month, year):
    """Extract the daily rows from one saved monthly HTML page.

    Reads ``Data/Html_Data/<year>/<month>.html``, collects the text of every
    cell found in tables with class ``medias mensuales numspan``, regroups the
    flat cell list into rows of 15 cells, discards the first and last rows,
    and keeps 8 of the 15 columns of each remaining row.

    Args:
        month: Month identifier used as the HTML file name.
        year: Year identifier used as the directory name.

    Returns:
        A list of rows; each row is a list of 8 cell strings (original
        column positions 1-5 and 7-9).
        NOTE(review): these presumably line up with the T, TM, Tm, SLP, H,
        VV, V, VM header written by the script section -- confirm.

    Raises:
        FileNotFoundError: If the HTML file does not exist.
        IndexError: If fewer than two complete 15-cell rows are found.
    """
    cells_per_row = 15

    # Use a context manager so the file handle is always released.
    with open('Data/Html_Data/{}/{}.html'.format(year, month), 'rb') as file_html:
        plain_text = file_html.read()

    soup = BeautifulSoup(plain_text, "lxml")

    # Flatten table -> child -> grandchild into one list of cell texts,
    # in document order.
    cells = [
        cell.get_text()
        for table in soup.findAll('table', {'class': 'medias mensuales numspan'})
        for section in table
        for cell in section
    ]

    # Regroup into complete rows by slicing (avoids repeated O(n) pop(0)).
    # Trailing cells that do not fill a whole row are ignored.
    row_count = len(cells) // cells_per_row
    records = [
        cells[start:start + cells_per_row]
        for start in range(0, row_count * cells_per_row, cells_per_row)
    ]

    # Drop the last and the first row (presumably summary and header rows --
    # confirm against the HTML). `del` raises IndexError on an empty list,
    # so a page without usable rows still fails loudly.
    del records[-1]
    del records[0]

    # Keep columns 1-5 and 7-9; columns 0, 6 and 10-14 are discarded.
    return [row[1:6] + row[7:10] for row in records]
def data_combine(year, cs):
    """Load one year's CSV and return all of its data rows as lists.

    The file ``Data/Real-Data/real_<year>.csv`` is read in chunks of ``cs``
    rows and every chunk is appended to the result, so the full file is
    returned regardless of how many chunks it spans.

    Args:
        year: Year used to build the CSV file name.
        cs: Number of rows per chunk passed to ``pandas.read_csv``.

    Returns:
        A list of rows (each a list of cell values) in file order; empty if
        the file contains no data rows.
    """
    path = 'Data/Real-Data/real_' + str(year) + '.csv'
    rows = []
    for chunk in pd.read_csv(path, chunksize=cs):
        # Accumulate instead of overwriting: otherwise only the final chunk
        # would survive when the file has more than `cs` rows.
        rows.extend(chunk.values.tolist())
    return rows
if __name__ == "__main__":
    # Script entry point. For each year 2013-2016 it:
    #   1. gathers the twelve monthly row sets from met_data(),
    #   2. appends the year's PM 2.5 value to each row,
    #   3. writes the rows without missing cells to real_<year>.csv,
    # and finally concatenates the four yearly files into Real_Combine.csv.
    real_data_dir = 'Data/Real-Data'
    header = ['T', 'TM', 'Tm', 'SLP', 'H', 'VV', 'V', 'VM', 'PM 2.5']
    missing = '-'
    years = range(2013, 2017)

    # Explicit dispatch table instead of a getattr() lookup on this module.
    pm_sources = {
        2013: avg_data_2013,
        2014: avg_data_2014,
        2015: avg_data_2015,
        2016: avg_data_2016,
    }

    os.makedirs(real_data_dir, exist_ok=True)

    for year in years:
        final_data = []
        for month in range(1, 13):
            final_data.extend(met_data(month, year))

        pm = pm_sources[year]()

        # Give every row a PM 2.5 cell. Previously the loop stopped one row
        # early, so the last row was written one column short. Rows that
        # have no corresponding PM entry get the '-' marker and are removed
        # by the filter below; this also covers the former special case that
        # padded a 364-entry list.
        for index, row in enumerate(final_data):
            # final[i].insert(0, i + 1)
            row.insert(8, pm[index] if index < len(pm) else missing)

        # newline='' is what the csv module requires; without it extra blank
        # lines appear between rows on platforms that translate '\n'.
        year_path = 'Data/Real-Data/real_' + str(year) + '.csv'
        with open(year_path, 'w', newline='') as csvfile:
            wr = csv.writer(csvfile, dialect='excel')
            wr.writerow(header)
            # Skip any row containing an empty or '-' (missing) cell.
            wr.writerows(
                row for row in final_data
                if not any(elem == "" or elem == missing for elem in row)
            )

    total = []
    for year in years:
        total.extend(data_combine(year, 600))

    with open('Data/Real-Data/Real_Combine.csv', 'w', newline='') as csvfile:
        wr = csv.writer(csvfile, dialect='excel')
        wr.writerow(header)
        wr.writerows(total)
# Load the combined CSV into a DataFrame named `df`.
# NOTE(review): indentation was lost in this copy of the file, so it is
# unclear whether this statement runs at module level (i.e. also on import)
# or only under the __main__ guard -- confirm against the original file.
df=pd.read_csv('Data/Real-Data/Real_Combine.csv')