filter.py
import os
import csv
from collections import defaultdict
# Predefined row numbers (1-based, not 0-indexed) grouped by party label for each input file
input_file_1_lists = [
{"label": "no party", "rows": [2, 3, 4, 5, 102, 103, 105, 107, 110, 111, 112, 120, 123, 127, 2422, 2618, 2618, 2654,
2968, 3246, 3591, 3974, 4039, 4579, 5802, 7, 8, 9, 10, 11, 76, 78, 79, 81, 82, 83,
93, 94, 97, 98, 100, 101, 4406, 4705, 6007, 6515, 2064, 5619, 5632, 3245, 3251, 19,
22, 43, 44, 47, 48, 50, 52, 58, 61, 62, 65, 91, 3373, 6439, 5609, 1577, 2069, 2309,
3279, 4678, 2312, 6526, 25, 26, 27, 28, 30, 31, 33, 35, 36, 38, 40, 41, 42, 4428,
2262, 2683, 4096, 134, 139, 141, 142, 148, 154, 155, 349, 407, 2350, 3362, 9369,
3372, 3373, 3374, 3580, 3581, 6326, 4225, 4599, 4601, 6211, 6530, 6531, 272, 307,
330, 398, 401, 402, 445, 648, 649, 723, 1628, 2151, 2857, 3509, 3528, 3621, 4094,
4405]},
{"label": "RN", "rows": [23, 24, 39, 53, 70, 74, 89, 96, 104, 150, 3365, 6322, 6324, 6329, 3493, 1229, 1262, 1264,
1324, 915, 5623, 285, 291, 406, 458, 470, 497, 517, 535, 539, 555, 625, 643, 697, 799, 877,
966, 1016, 1080, 1081, 1089, 1109, 1203, 1300, 1341, 1410, 1449, 1549, 1636, 2139, 2152,
2153, 2177, 2185, 2189, 2304, 2335, 2339, 2370, 2375, 2382, 2511, 2535, 2541, 2600, 2849,
2867, 2886, 3090, 3100, 3226, 3242, 3249, 3264, 3269, 3381, 3498, 3565, 3872, 3936, 3940,
3996, 4148, 4329, 4338, 4344, 4582, 4666, 4671, 4777, 4834, 5087, 5132, 5339, 5495, 5543,
5681, 5801, 5872, 5972, 5936, 5963, 6036, 6091, 6109, 6193, 6402, 6499, 6578, 5375, 4945,
5843, 194, 572, 1092, 615, 1069, 3559, 411, 4677, 4688]},
{"label": "LFI", "rows": [18, 87, 116, 3572, 4820, 3680, 679, 4647, 4648, 329, 422, 586, 588, 653, 690, 691, 698,
701, 706, 969, 1070, 1197, 1315, 1323, 1382, 1407, 1612, 1813, 1842, 2182, 221, 2509,
2675, 2845, 2894, 2924, 3434, 3508, 3608, 3629, 3840, 3848, 4302, 4324, 4527, 4647, 4704,
4752, 5428, 6206, 6373, 6583, 6699, 915, 1121, 1276, 3997, 4084, 5463, 5716, 217, 2182,
4091, 4896, 5048, 1406, 2322, 1406]},
{"label": "LR", "rows": [32, 45, 49, 59, 63, 64, 71, 72, 75, 99, 3360, 3891, 4017, 6325, 4310, 289, 684, 974, 1004,
1012, 1238, 1243, 1296, 1515, 1533, 1537, 1759, 1961, 2234, 2257, 2358, 2363, 2365, 3233,
3346, 3356, 3360, 3576, 3583, 4066, 4380, 5049, 5263, 5466, 5488, 5797, 5850, 6419, 6460,
6714, 568, 950, 1455, 1986, 6501, 5045, 3816]},
{"label": "PS", "rows": [108, 149, 960, 342, 1199, 2264, 6266, 6267, 5977, 4614, 6037, 6110]},
{"label": "PCF", "rows": [133, 69, 66, 316, 1055, 1406, 1412, 1413, 1414, 1457, 1470, 1499, 1803, 2350, 2371, 2411,
3765, 5047, 5088, 6202, 6321, 6323]},
{"label": "MoDem", "rows": [106, 113, 84, 85, 532, 1036, 1095, 1143, 1163, 1227, 1423, 1477, 2127, 2326, 2357, 2591,
2638, 3157, 3487, 3659, 3679, 3825, 4131, 4331, 4368, 5026, 5996, 6077, 6343, 6366,
6413, 6434, 6623, 6648, 2065]},
{"label": "Agir", "rows": [218, 225, 309, 1884, 3752]},
{"label": "UDI", "rows": [231, 272, 632, 1120, 1292, 1467, 1514, 1531, 1586, 1790, 3522, 3807, 3813, 3924, 6440]},
{"label": "EELV", "rows": [1675, 2128, 3641, 6054]},
{"label": "Gen", "rows": [6264, 6397]},
{"label": "PlPublique", "rows": [1209]},
{"label": "MRSL", "rows": [6494]},
{"label": "GRS", "rows": [3884, 1195]}
]
input_file_2_lists = [
{"label": "no party", "rows": [987, 1171, 795, 42, 988, 1016, 1023, 1071, 1069, 1066, 1060, 1041, 1018, 842, 833,
831, 829, 819, 295, 44, 132, 185, 266, 595, 621, 634, 758]},
{"label": "LREM", "rows": [1015, 828]},
{"label": "LFI", "rows": [34, 48, 103, 261, 353, 339, 390, 1136, 765]},
{"label": "Gen", "rows": [1106]},
{"label": "PS", "rows": [368]},
{"label": "EELV", "rows": [24, 765]}
]
input_file_3_lists = [
{"label": "GRS", "rows": [59]},
{"label": "no party", "rows": [561]},
{"label": "EELV", "rows": [68]}
]
input_file_4_lists = [
{"label": "no party", "rows": [3, 9, 12, 170, 164, 429, 441, 434, 432, 402, 415, 194, 148]},
{"label": "RN", "rows": [23]},
{"label": "UDI", "rows": [129]}
]
input_file_5_lists = [
{"label": "no party", "rows": [2, 9, 21, 26, 44]}
]
input_file_6_lists = [
{"label": "no party", "rows": [3, 4, 11, 13, 14, 27, 30, 36, 42, 47, 49, 50, 53, 55, 79, 92, 94, 106, 107, 139, 140,
141, 148]},
{"label": "EELV", "rows": [2, 37]},
{"label": "LREM", "rows": [41]}
]
input_file_7_lists = [
{"label": "no party", "rows": [4, 18]}
]
input_file_8_lists = [
{"label": "PCF", "rows": [60, 116, 144]},
{"label": "NouvDonne", "rows": [57]},
{"label": "LFI", "rows": [44, 45]},
{"label": "PS", "rows": [29]}
]
input_file_9_lists = [
{"label": "no party", "rows": [3, 4, 13, 39, 87]},
{"label": "LREM", "rows": [2]},
{"label": "LFI", "rows": [8]}
]
# Create a dictionary to store the lists for each input file
input_list_dicts = {
os.path.join("data", "input_file_1.csv"): input_file_1_lists,
os.path.join("data", "input_file_2.csv"): input_file_2_lists,
os.path.join("data", "input_file_3.csv"): input_file_3_lists,
os.path.join("data", "input_file_4.csv"): input_file_4_lists,
os.path.join("data", "input_file_5.csv"): input_file_5_lists,
os.path.join("data", "input_file_6.csv"): input_file_6_lists,
os.path.join("data", "input_file_7.csv"): input_file_7_lists,
os.path.join("data", "input_file_8.csv"): input_file_8_lists,
os.path.join("data", "input_file_9.csv"): input_file_9_lists,
}
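# Assumed layout: each CSV above sits in a data/ folder next to this script and
# contains a 'from_user_name' column; all other columns are ignored below.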
# Filter one input file: for every labelled row list, collect the matching "from_user_name" values
def filter_and_extract_data(input_file, row_number_dicts, output_data):
    with open(input_file, "r", newline='', encoding='utf-8') as csvfile:
        csvreader = csv.DictReader(csvfile)
        data = [row for row in csvreader]
    for row_number_dict in row_number_dicts:
        label = row_number_dict["label"]
        row_numbers = row_number_dict["rows"]
        # Get the corresponding "from_user_name" values for the row numbers
        user_names = [data[row_num - 1]['from_user_name']
                      for row_num in row_numbers if 0 < row_num <= len(data)]
        unique_user_names = set(user_names)  # Remove duplicates
        # Add the unique "from_user_name" values to output_data under the corresponding label
        output_data[label].update(unique_user_names)
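
# Indexing assumption: row numbers are treated as 1-based positions among the data
# rows returned by DictReader (header excluded), so row_num 2 reads data[1]; if the
# predefined rows were counted with the CSV header as row 1, the lookup would
# instead need data[row_num - 2].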

# Prepare a dictionary to store the output data for each label
output_data = defaultdict(set)

# Filter and extract data from each input file
for input_file, row_number_dicts in input_list_dicts.items():
    filter_and_extract_data(input_file, row_number_dicts, output_data)

# Write the collected user names to one output file per label
for label, user_names in output_data.items():
    output_file = f"{label}_output.txt"
    with open(output_file, "w", newline='', encoding='utf-8') as txtfile:
        txtfile.write("from_user_name\n")  # Write header in the output txt file
        for user_name in user_names:
            txtfile.write(f"{user_name}\n")
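
# Optional sanity check (a minimal sketch, not part of the filtering itself): report
# how many unique user names were collected per label so the output files can be
# spot-checked against the predefined row lists.
for label, user_names in sorted(output_data.items()):
    print(f"{label}: {len(user_names)} unique user names")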