import albumentations as A
import cv2
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset as BaseDataset
from torch.utils.data import default_collate
from torchvision.transforms import ToTensor
from tqdm import tqdm
import utils


def read_annot_file(path):
"""Reads annotations from file in YOLO format.
Each line in file should look like:
<object-class> <x> <y> <width> <height>
Args:
path (str): Path to the file.
Returns:
A list of bboxes where each bbox is represented as a tuple that looks like
(x, y, w, h, class)
"""
bboxes = []
with open(path) as annot_file:
for line in annot_file.read().strip().split("\n"):
line = line.split()
bboxes.append(
tuple([float(bbox_param) for bbox_param in line[1:5]] + [int(line[0])])
)
return bboxes
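

# A minimal usage sketch for read_annot_file. The file name below is hypothetical;
# assuming "000001.txt" contains the single line "11 0.5 0.6 0.2 0.3":
#
#   bboxes = read_annot_file("000001.txt")
#   # -> [(0.5, 0.6, 0.2, 0.3, 11)]
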
def collate_fn(batch):
    """Pass this function as the collate_fn argument of the DataLoader.

    Images are collated into a single batch tensor, while the per-image annotation
    lists (which can have different lengths) are returned as a tuple of lists.
    """
    images, annotations = zip(*batch)
    return default_collate(images), annotations
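

# A minimal sketch of wiring collate_fn into a DataLoader; `dataset` here stands
# for an instance of the Dataset class defined below:
#
#   from torch.utils.data import DataLoader
#
#   loader = DataLoader(dataset, batch_size=16, shuffle=True, collate_fn=collate_fn)
#   for images, annotations in loader:
#       pass  # images is a (16, 3, 448, 448) tensor, annotations a tuple of lists
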
default_resize = A.Compose(
[A.Resize(448, 448)], bbox_params=A.BboxParams(format="yolo")
)
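

# A sketch of a custom augmentation pipeline that could replace default_resize.
# The chosen transforms are illustrative; what matters is passing the "yolo"
# bbox_params so bboxes are transformed together with the image:
#
#   train_augmentations = A.Compose(
#       [A.HorizontalFlip(p=0.5), A.ColorJitter(p=0.4), A.Resize(448, 448)],
#       bbox_params=A.BboxParams(format="yolo"),
#   )
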
class Dataset(BaseDataset):
def __init__(
self,
img_sets,
augmentations=default_resize,
transforms=ToTensor(),
grid_size=7,
number_of_classes=20,
read_annots_once=True,
):
"""Reads dataset in YOLO format.
Args:
img_set (list of strings): List of pathes to files with pathes to images and
annotations (see convert_voc_labels.py).
augmentations (callable, optional): Albumentations augmentation pipeline or
custom function/transform with same interface. When using albumentations
Compose, pass albumentations.BboxParams("yolo") as bbox_params argument.
transforms (callable, optional): A function/transform that takes in an numpy
array and returns a transformed version. Defaults to
torchvision.transforms.ToTensor().
grid_size (int, optional): YOLO hyperparameter (see paper for details).
Defaults to 7.
number_of_classes (int, optional): Number of classes in dataset.
Defaults to 20.
read_annots_once (bool, optional): If set to True loads annotations in RAM
during dataset initialization. Otherwise reads annotation file every
time __getitem__ is called. Defaults to True.
"""
self.augmentations = augmentations
self.transforms = transforms
self.grid_size = grid_size
self.number_of_classes = number_of_classes
# Reading files with image sets
self.img_pathes = []
self.annot_pathes = []
self.annotations = []
for path_to_set in img_sets:
if read_annots_once:
print(f"Reading annnotations for {path_to_set} image set")
with open(path_to_set) as image_set_file:
for line in tqdm(
image_set_file.read().strip().split("\n"),
disable=(not read_annots_once),
):
                    split_line = line.strip().split(" ")
                    assert len(split_line) == 2, (
                        f"Something went wrong while reading image set file "
                        f"'{path_to_set}'. Extra spaces in line: '{line}'"
                    )
                    self.img_pathes.append(split_line[0])
                    if read_annots_once:
                        self.annotations.append(read_annot_file(split_line[1]))
                    else:
                        self.annot_pathes.append(split_line[1])

    def __len__(self):
        """Returns the number of images in the dataset."""
        return len(self.img_pathes)

    def __getitem__(self, index):
        """Returns the image after transforms and the corresponding annotations."""
# Read image
img = cv2.cvtColor(cv2.imread(self.img_pathes[index]), cv2.COLOR_BGR2RGB)
# Read annotations
if self.annotations:
bboxes = self.annotations[index]
else:
bboxes = read_annot_file(self.annot_pathes[index])
# Apply augmentations
if self.augmentations:
augmented = self.augmentations(image=img, bboxes=bboxes)
img = augmented["image"]
bboxes = augmented["bboxes"]
# Apply transforms
if self.transforms:
img = self.transforms(img)
return img, bboxes
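

# A hypothetical instantiation sketch; the image set file names are assumptions
# (see convert_voc_labels.py for how such files are produced):
#
#   dataset = Dataset(["voc2007_train.txt", "voc2012_train.txt"])
#   img, bboxes = dataset[0]
#   # img: torch.Tensor of shape (3, 448, 448); bboxes: list of (x, y, w, h, class)
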
def add_activations(model, activation, *args, **kwargs):
"""Adds activation functions after all torch.nn.Conv2d layers.
Args:
model (torch.nn.Sequential or list): A convolution model or a list of its layers
to which activation functions should be added.
activation: An activation function that will be initialized with the given
parameters and added to the model layers, e.g. torch.nn.LeakyReLU.
*args: Positional arguments to initialize the activation function.
**kwargs: Keyword arguments to initialize the activation function.
Returns:
torch.nn.Sequential: New model.
"""
new_model = []
for l in model:
new_model.append(l)
if isinstance(l, nn.Conv2d):
new_model.append(activation(*args, **kwargs))
return nn.Sequential(*new_model)
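

# Example sketch: interleaving LeakyReLU(0.1) after every convolution, as the
# Backbone below does:
#
#   layers = [nn.Conv2d(3, 64, 7, 2, 3), nn.MaxPool2d(2, 2)]
#   model = add_activations(layers, nn.LeakyReLU, negative_slope=0.1)
#   # -> Sequential(Conv2d, LeakyReLU, MaxPool2d)
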
class Backbone(nn.Module):
def __init__(self):
"""Original YOLOv1 backbone."""
super().__init__()
self.layers = [
nn.Conv2d(3, 64, 7, 2, 3),
nn.MaxPool2d(2, 2),
nn.Conv2d(64, 192, 3, 1, 1),
nn.MaxPool2d(2, 2),
nn.Conv2d(192, 128, 1, 1, 0),
nn.Conv2d(128, 256, 3, 1, 1),
nn.Conv2d(256, 256, 1, 1, 0),
nn.Conv2d(256, 512, 3, 1, 1),
nn.MaxPool2d(2, 2),
nn.Conv2d(512, 256, 1, 1, 0),
nn.Conv2d(256, 512, 3, 1, 1),
nn.Conv2d(512, 256, 1, 1, 0),
nn.Conv2d(256, 512, 3, 1, 1),
nn.Conv2d(512, 256, 1, 1, 0),
nn.Conv2d(256, 512, 3, 1, 1),
nn.Conv2d(512, 256, 1, 1, 0),
nn.Conv2d(256, 512, 3, 1, 1),
nn.Conv2d(512, 256, 1, 1, 0),
nn.Conv2d(256, 1024, 3, 1, 1),
nn.MaxPool2d(2, 2),
nn.Conv2d(1024, 512, 1, 1, 0),
nn.Conv2d(512, 1024, 3, 1, 1),
nn.Conv2d(1024, 512, 1, 1, 0),
nn.Conv2d(512, 1024, 3, 1, 1),
nn.Conv2d(1024, 1024, 3, 1, 1),
nn.Conv2d(1024, 1024, 3, 2, 1),
nn.Conv2d(1024, 1024, 3, 1, 1),
nn.Conv2d(1024, 1024, 3, 1, 1),
]
self.layers = add_activations(self.layers, nn.LeakyReLU, negative_slope=0.1)

    def forward(self, batch):
return self.layers(batch)
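

# A quick shape sanity check (sketch): the backbone downsamples by a factor of 64
# (a stride-2 conv, four max pools, and another stride-2 conv), so a 448x448 input
# yields the 7x7x1024 feature map the fully connected layers expect:
#
#   features = Backbone()(torch.zeros(1, 3, 448, 448))
#   # features.shape -> torch.Size([1, 1024, 7, 7])
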
class Model(nn.Module):
def __init__(
self, backbone=None, grid_size=7, number_of_bboxes=2, number_of_classes=20
):
"""Creates YOLOv1 model.
Args:
backbone (callable, optional): Backbone model. Last two layers of the model
will be automatically replaced by adaptive average pooling and 1x1
convolution to match input size of fully connected layers. If not
specified, the default backbone will be used (see Figure 3 in paper for
architecture details).
grid_size (int, optional): YOLO hyperparameter (see paper for details).
Defaults to 7.
number_of_bboxes (int, optional): Number of bounding boxes to predict per
grid cell. Defaults to 2.
number_of_classes (int, optional): Number of classes. Defaults to 20.
"""
super().__init__()
self.grid_size = grid_size
self.number_of_bboxes = number_of_bboxes
self.preds_per_cell = number_of_bboxes * 5 + number_of_classes
if backbone:
self.set_backbone(backbone)
else:
self.backbone = Backbone()
self.fc_layers = nn.Sequential(
nn.Flatten(),
nn.Linear(grid_size * grid_size * 1024, 4096),
nn.LeakyReLU(0.1),
nn.Dropout(0.5),
nn.Linear(4096, grid_size * grid_size * self.preds_per_cell),
)

    def set_backbone(self, model):
        """Makes the given model compatible with the fully connected layers and
        sets it as the backbone."""
model = nn.Sequential(*list(model.children())[:-2])
output_channels = model(torch.zeros(1, 3, 448, 448)).shape[1]
self.backbone = nn.Sequential(
model,
nn.AdaptiveAvgPool2d(self.grid_size),
nn.Conv2d(output_channels, 1024, 1),
)

    def forward(self, batch):
batch = self.backbone(batch)
batch = self.fc_layers(batch)
batch = batch.reshape(-1, self.grid_size, self.grid_size, self.preds_per_cell)
return batch

    @torch.no_grad()
def predict(self, batch, threshold=0.25, iou_threshold=0.5):
"""Runs the model on given batch and decodes its output.
Args:
batch (torch.tensor): Batch of images.
threshold (float, optional): The minimum class-specific confidence score.
This score is the product of the cell's conditional class probability
and bounding box confidence (see paper for details). Defaults to 0.25.
iou_threshold (float, optional): The iou threshold for suppressing extra
bounding boxes using Non-Maximum Supression algorithm. A lower threshold
means stricter filtering. Defaults to 0.5.
Returns:
A list of predictions where each prediction is a list of bounding boxes.
Each bounding box is represented as a tuple that looks like
(x, y, w, h, class, class-specific confidence score, bbox confidence, class probability)
"""
self.eval()
predicted_grids = self(batch)
cell_size = 1 / self.grid_size
predictions = []
for grid in predicted_grids:
current_pred = []
for row in range(grid.shape[0]):
for col in range(grid.shape[1]):
cell = grid[row, col]
# Find bbox with the highest confidence in current cell
max_conf_bbox = None
for bbox_i in range(self.number_of_bboxes):
if (
max_conf_bbox is None
or cell[(bbox_i + 1) * 5 - 1] > max_conf_bbox[-1]
):
max_conf_bbox = cell[bbox_i * 5 : (bbox_i + 1) * 5]
                    # Find the class with the highest conditional probability
                    class_prob, class_index = cell[self.number_of_bboxes * 5 :].max(-1)
                    class_prob = float(min(1, max(0, class_prob)))
                    class_index = int(class_index)
                    # Unpack bounding box parameters and clamp the confidence
                    x, y, w, h, bbox_conf = max_conf_bbox
                    bbox_conf = float(min(1, max(0, bbox_conf)))
# Filtering bboxes by threshold
if class_prob * bbox_conf < threshold:
continue
# Coordinates post-processing
x = (x + col) * cell_size
x = float(min(1, max(0, x)))
y = (y + row) * cell_size
y = float(min(1, max(0, y)))
w = float(min(1, max(0, w**2)))
h = float(min(1, max(0, h**2)))
# Save prediction for current cell
current_pred.append(
(x, y, w, h)
+ (class_index, class_prob * bbox_conf, bbox_conf, class_prob)
)
predictions.append(current_pred)
        return utils.nms(predictions, iou_threshold=iou_threshold)
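

# An inference sketch, assuming `images` is an (N, 3, 448, 448) float tensor on the
# same device as the model; decoded coordinates are normalized to [0, 1]:
#
#   model = Model()
#   detections = model.predict(images, threshold=0.25, iou_threshold=0.5)
#   for x, y, w, h, class_index, score, bbox_conf, class_prob in detections[0]:
#       ...
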
class Loss(nn.Module):
    def __init__(self, lambda_coord=5.0, lambda_noobj=0.5, number_of_classes=20):
        """Implementation of the original YOLOv1 loss function.

        Args:
            lambda_coord (float, optional): Coefficient for bounding box coordinate
                predictions (see paper for details). Defaults to 5.
            lambda_noobj (float, optional): Coefficient for confidence predictions
                for bounding boxes that don't contain objects (see paper for
                details). Defaults to 0.5.
            number_of_classes (int, optional): Number of classes. Defaults to 20.
        """
        super().__init__()
        self.lambda_coord = lambda_coord
        self.lambda_noobj = lambda_noobj
        self.number_of_classes = number_of_classes

    def forward(self, pred, gt):
        """Calculates the loss.

        Args:
            pred (torch.Tensor): Tensor predicted by the model.
            gt (list): Ground truth for the batch, where each element is a list of
                bboxes. Each bbox is represented as a tuple that looks like
                (x, y, w, h, class).

        Returns:
            torch.Tensor: Tensor of size 1.
        """
loss = 0
cell_size = 1 / pred.shape[1]
for ex_i in range(len(gt)):
cells_with_obj = []
for x, y, w, h, class_index in gt[ex_i]:
                # Find the grid cell responsible for predicting the current bbox
                # (x indexes columns, y indexes rows)
                row, col = int(y // cell_size), int(x // cell_size)
                # YOLO can predict only one object per grid cell, so some ground
                # truth bboxes will not be taken into account
                if (row, col) in cells_with_obj:
                    continue
                cells_with_obj.append((row, col))
                # Convert ground truth coordinates to be relative to the bounds of
                # the grid cell (the model predicts them in this form)
                x = x / cell_size - col
                y = y / cell_size - row
                # The model predicts the square root of the bbox width and height
                w **= 0.5
                h **= 0.5
# Choose predicted bbox with the highest IOU with the ground truth
current_pred = pred[ex_i, row, col]
bbox1_iou = utils.iou(current_pred[:4], (x, y, w, h))
bbox2_iou = utils.iou(current_pred[5:9], (x, y, w, h))
pred_bbox, pred_bbox_iou = (
(current_pred[:5], bbox1_iou)
if bbox1_iou > bbox2_iou
else (current_pred[5:10], bbox2_iou)
)
# Coordinates and size loss
loss += self.lambda_coord * F.mse_loss(
pred_bbox[:4],
torch.tensor([x, y, w, h], dtype=torch.float, device=pred.device),
reduction="sum",
)
# Confidence loss
loss += F.mse_loss(
pred_bbox[4],
torch.tensor(pred_bbox_iou, dtype=torch.float, device=pred.device),
)
                # Classification loss
                loss += F.mse_loss(
                    current_pred[-self.number_of_classes :],
                    F.one_hot(torch.tensor(class_index), self.number_of_classes).to(
                        device=pred.device, dtype=torch.float
                    ),
                    reduction="sum",
                )
            # No-object confidence loss; indices 4 and 9 are the confidences of the
            # two predicted bboxes (like the rest of this loss, this assumes
            # number_of_bboxes=2, as in the paper). The fancy indexing returns a
            # copy, so the in-place zeroing below doesn't modify the model output.
            confidence_preds = pred[ex_i][..., [4, 9]]
            # Don't count the no-object loss for cells that contain objects
            if cells_with_obj:
                confidence_preds[tuple(zip(*cells_with_obj))] *= 0
            loss += self.lambda_noobj * F.mse_loss(
                confidence_preds, torch.zeros_like(confidence_preds), reduction="sum"
            )
return loss / len(gt)
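

# A minimal training step sketch; the optimizer and learning rate below are
# illustrative choices, not the schedule from the paper:
#
#   model, criterion = Model(), Loss()
#   optimizer = torch.optim.SGD(model.parameters(), lr=1e-3, momentum=0.9)
#   for images, annotations in loader:  # loader uses collate_fn from above
#       optimizer.zero_grad()
#       loss = criterion(model(images), annotations)
#       loss.backward()
#       optimizer.step()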