-
Notifications
You must be signed in to change notification settings - Fork 7
/
Copy pathprocess_udayton_temp_single_step.py
232 lines (195 loc) · 8.72 KB
/
process_udayton_temp_single_step.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
import copy
import datetime
import json
import os
import random
from tqdm import tqdm
# NOTE(review): "classfication" is misspelled and this script produces
# forecasting samples; the value is not read anywhere in the visible code, so
# it is left untouched here.
task_type_prompt = "Task type: time series classfication"


def _as_query(question):
    """Wrap a question in the <query> / <response> markers used by every prompt."""
    return "<query>\n" + question + "\n<response>\n"


# Placeholders filled in later by the generation loop:
#   [TIME-POINT-1] / [TIME-POINT-2]  first / last day of the observed window
#   [TIME-POINT-3]                   day whose temperature is asked for
#   [REGION]                         location name
#   <ts-data-blank-sep> / <ts-data-comma-sep>  the observed values, joined by
#                                    blanks or commas respectively
#   [TARGET]                         the value to predict
# A time point contains the date and may also include the day of the week.

# Hand-written base template.
template_1 = {
    "input": _as_query(
        "Between [TIME-POINT-1] and [TIME-POINT-2], the mean daily temperatures "
        "recorded in [REGION] were <ts-data-blank-sep> (F) with -99 indicating "
        "missing data points. What is the projected temperature on [TIME-POINT-3]?"
    ),
    "target": "The anticipated temperature will be [TARGET] degrees.",
}

# Paraphrased variants (three English, two Chinese).
list_templates_by_chatgpt = [
    {
        "input": _as_query(
            "Between [TIME-POINT-1] and [TIME-POINT-2], the mean daily temperatures "
            "recorded in [REGION] amounted to <ts-data-blank-sep> (degrees). "
            "What is the projected temperature for [TIME-POINT-3]?"
        ),
        "target": "The anticipated temperature will be [TARGET] degrees.",
    },
    {
        "input": _as_query(
            "For the time span covering [TIME-POINT-1] to [TIME-POINT-2], [REGION] "
            "experienced an average daily temperature of <ts-data-comma-sep> (F) "
            "with -99 denoting unavailable data points. "
            "What is the forecasted temperature on [TIME-POINT-3]?"
        ),
        "target": "The forecasted temperature on [TIME-POINT-3] will be [TARGET] degrees.",
    },
    {
        "input": _as_query(
            "Considering the period from [TIME-POINT-1] to [TIME-POINT-2], the "
            "average daily temperatures observed in [REGION] amounted to "
            "<ts-data-blank-sep> (F) with -99 representing missing data. "
            "What is the expected temperature on [TIME-POINT-3]?"
        ),
        "target": "The expected temperature will be [TARGET] degrees.",
    },
    {
        "input": _as_query(
            "从[TIME-POINT-1]到[TIME-POINT-2],[REGION]地区的平均每日温度为"
            "<ts-data-blank-sep>(华氏度)。-99表示缺失值。"
            "请问在[TIME-POINT-3]的温度将是多少?"
        ),
        "target": "温度将会是[TARGET]度。",
    },
    {
        "input": _as_query(
            "[TIME-POINT-1]至[TIME-POINT-2]期间,[REGION]地区的平均每日温度为"
            "<ts-data-blank-sep>(华氏度)。-99代表有缺失值。"
            "请问在[TIME-POINT-3]时的温度会是多少?"
        ),
        "target": "温度会是[TARGET]度。",
    },
]

# Full pool used for generation: the base template first, then the variants.
list_templates = [template_1, *list_templates_by_chatgpt]
# Name stamped into every generated sample; also the output sub-folder name.
task_name = "udayton_temperature"

# Output folder for the generated files; created up front so the later
# writes cannot fail on a missing directory.
to_folder = os.path.join("datasets/prompt_datasets", task_name)
os.makedirs(to_folder, exist_ok=True)

# Location table loaded from JSON. The generation loop uses each key as the
# stem of a per-site data file name and joins each value with ", ".
# Opened through a context manager so the handle is closed deterministically
# instead of being left to the garbage collector.
with open(
    "datasets/udayton_temperature_archive/location_name.json",
    "r",
    encoding="utf-8",
) as location_file:
    location_id2name = json.load(location_file)

# Number of consecutive observed days placed in a prompt.
window_sizes = [15, 30, 60]
# How many days after the end of the window the target day lies.
horizon_sizes = [1, 2, 3, 7]

# Month number (as an unpadded string) -> English month name.
month_id2name = {
    "1": "January",
    "2": "February",
    "3": "March",
    "4": "April",
    "5": "May",
    "6": "June",
    "7": "July",
    "8": "August",
    "9": "September",
    "10": "October",
    "11": "November",
    "12": "December",
}

# ISO weekday number (1 = Monday ... 7 = Sunday, as a string) -> English name.
weekday_idx2name = {
    "1": "Monday",
    "2": "Tuesday",
    "3": "Wednesday",
    "4": "Thursday",
    "5": "Friday",
    "6": "Saturday",
    "7": "Sunday",
}
def _time_point_variants(time_info):
    """Render one (month, day, year) triple in six textual date formats.

    Args:
        time_info: three strings (month number, day number, year), i.e. the
            first three fields of a parsed data row.

    Returns:
        A list of six strings, in this order (shown for 1/7/2019):
        "January 7, 2019", "Jan 7, 2019", "2019-01-07", followed by the same
        three prefixed with the weekday name ("Monday, January 7, 2019", ...).

    Raises:
        KeyError: if the month is not a key of ``month_id2name``.
        ValueError: if the triple does not form a valid ISO date.
    """
    month, day, year = time_info
    long_form = f"{month_id2name[month]} {day}, {year}"
    short_form = f"{month_id2name[month][:3]} {day}, {year}"

    # date.fromisoformat() only accepts zero-padded two-digit month/day.
    padded_month = month.rjust(2, "0")
    padded_day = day.rjust(2, "0")
    parsed = datetime.date.fromisoformat(f"{year}-{padded_month}-{padded_day}")
    iso_form = parsed.strftime("%Y-%m-%d")

    weekday = weekday_idx2name[str(parsed.isoweekday())]
    plain_forms = [long_form, short_form, iso_form]
    return plain_forms + [f"{weekday}, " + text for text in plain_forms]


def _fill_template(template, ts_values, target_value, time_strs, region):
    """Substitute every placeholder of one prompt template.

    Args:
        template: dict with "input" and "target" template strings.
        ts_values: list of observed values (strings) for the window.
        target_value: value (string) at the horizon day.
        time_strs: (start, end, horizon) date strings.
        region: location name inserted for ``[REGION]``.

    Returns:
        ``(query, response)`` with all placeholders replaced.
    """
    start_str, end_str, horizon_str = time_strs
    query = template["input"]

    # A template carries exactly one of the two series placeholders; which
    # one it is decides how the values are separated.
    if "<ts-data-comma-sep>" in query:
        query = query.replace("<ts-data-comma-sep>", ",".join(ts_values))
    else:
        query = query.replace("<ts-data-blank-sep>", " ".join(ts_values))

    query = (
        query.replace("[TIME-POINT-1]", start_str)
        .replace("[TIME-POINT-2]", end_str)
        .replace("[TIME-POINT-3]", horizon_str)
        .replace("[REGION]", region)
    )
    # Sanity check: no series placeholder may survive substitution.
    assert "<ts-data-comma-sep>" not in query
    assert "<ts-data-blank-sep>" not in query

    response = (
        template["target"]
        .replace("[TARGET]", target_value)
        .replace("[TIME-POINT-3]", horizon_str)
    )
    return query, response


# For every location, slide a window over its daily series and emit one sample
# per (window start, template); results go to one JSON-lines file per
# (location, window size, horizon size).
for location_idx, location_name in tqdm(location_id2name.items()):
    # presumably location_name is a list of name parts (e.g. city, region) --
    # verify against location_name.json
    location_name_str = ", ".join(location_name)

    # Each non-blank line becomes the list of its space-separated fields.
    # Downstream code reads the first three fields as month, day, year and the
    # last field as the temperature.
    list_time_points = []
    with open(
        f"datasets/udayton_temperature_archive/allsites/{location_idx}.txt",
        "r",
        encoding="utf-8",
    ) as f:
        for row in f:
            row = row.strip()
            if not row:
                continue
            list_time_points.append(
                [w.strip() for w in row.split(" ") if w.strip()]
            )

    for win_size in window_sizes:
        for horizon_size in horizon_sizes:
            # Horizons longer than 3 days are only paired with windows of at
            # least 40 days.
            if win_size < 40 and horizon_size > 3:
                continue

            # The last usable start index is len - win_size - horizon_size
            # (its horizon point is the final row), hence the "+ 1". The
            # previous bound stopped one short and dropped that window.
            # A series that is too short yields an empty range.
            num_windows = len(list_time_points) - win_size - horizon_size + 1

            list_samples = []
            for start_point in tqdm(range(num_windows)):
                end_point = start_point + win_size - 1
                horizon_point = end_point + horizon_size

                # Render the three dates in every format, then pick ONE format
                # at random so all dates within a sample look alike.
                variants = [
                    _time_point_variants(list_time_points[idx][:3])
                    for idx in (start_point, end_point, horizon_point)
                ]
                variant_idx = random.randrange(len(variants[0]))
                time_strs = tuple(forms[variant_idx] for forms in variants)

                ts_data_ = [
                    w[-1] for w in list_time_points[start_point: end_point + 1]
                ]
                horizon_data_ = list_time_points[horizon_point][-1]

                for template_ in list_templates:
                    query_text, response_text = _fill_template(
                        template_,
                        ts_data_,
                        horizon_data_,
                        time_strs,
                        location_name_str,
                    )
                    list_samples.append(
                        {
                            "original_data": (ts_data_, horizon_data_),
                            "target": response_text,
                            "input": query_text,
                            "task_name": task_name,
                            "task_type": "forecast",
                        }
                    )

            # One JSON object per line; the file is written even when empty.
            out_path = os.path.join(
                to_folder, f"{location_idx}_{win_size}_{horizon_size}.json"
            )
            with open(out_path, "w", encoding="utf-8") as f:
                for samp in list_samples:
                    f.write(json.dumps(samp, ensure_ascii=False) + "\n")