-
Notifications
You must be signed in to change notification settings - Fork 3
/
_0_prep_dataset.py
142 lines (114 loc) · 5.65 KB
/
_0_prep_dataset.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
import os
import shutil
import random
import uuid
import argparse
from tqdm import tqdm
from PIL import Image
# File extensions that process_file treats as images. process_file compares
# case-insensitively; the upper-case spellings are kept for any other user
# of this list.
all_img_extensions = ['.jpg', '.jpeg', '.png', '.bmp', '.tiff', '.tif', '.webp', '.JPEG', '.JPG', '.PNG', '.BMP', '.TIFF', '.TIF', '.WEBP']


def process_file(orig_path, new_path, args):
    """
    Transfer a single file from orig_path to new_path, re-encoding images when needed.

    For files whose extension is listed in all_img_extensions:
    1. open with PIL to read the resolution
    2. downsize to at most args.max_n_pixels pixels, keeping the aspect ratio
    3. write as jpg if args.convert_imgs_to_jpg is set
    Every file that did not need re-encoding is renamed or copied to new_path
    according to args.mode.

    Args:
        orig_path: path of the existing file.
        new_path: destination path; its extension is replaced by '.jpg'
            whenever the image is written as jpg.
        args: namespace providing max_n_pixels, convert_imgs_to_jpg and
            mode ('rename' or 'copy').

    Returns:
        (converted, resized): two 0/1 flags. A resized image reports
        resized=1 and converted=0, even when it was also written as jpg.
    """
    # dirname is '' for a bare filename, and os.makedirs('') raises.
    target_dir = os.path.dirname(new_path)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    file_extension = os.path.splitext(orig_path)[1]
    is_image = file_extension.lower() in all_img_extensions
    converted, resized = 0, 0

    if is_image:
        out_img = None
        # The context manager closes the source file handle, so the original
        # can be removed/overwritten afterwards on every platform.
        with Image.open(orig_path) as img:
            width, height = img.size
            n_pixels = width * height
            if n_pixels > args.max_n_pixels:
                # Both sides are scaled by sqrt(area ratio) so the resulting
                # area lands on max_n_pixels instead of far below it.
                scale = (args.max_n_pixels / n_pixels) ** 0.5
                new_size = (max(1, int(width * scale)), max(1, int(height * scale)))
                # Image.ANTIALIAS was removed in Pillow 10; LANCZOS is the same filter.
                out_img = img.resize(new_size, Image.LANCZOS)
                resized = 1
            elif args.convert_imgs_to_jpg:
                out_img = img
                converted = 1

            if out_img is not None and args.convert_imgs_to_jpg:
                new_path = os.path.splitext(new_path)[0] + '.jpg'
                # JPEG cannot store alpha/palette modes; convert() also loads
                # the pixel data before the source file is closed.
                out_img = out_img.convert("RGB")

        if out_img is not None:
            out_img.save(new_path, quality=95)
            # Only 'rename' mode may consume the original; 'copy' must leave
            # the source dataset untouched.
            same_file = os.path.abspath(orig_path) == os.path.abspath(new_path)
            if args.mode == 'rename' and not same_file:
                os.remove(orig_path)

    if not resized and not converted:
        if args.mode == 'rename':
            os.rename(orig_path, new_path)
        elif args.mode == 'copy':
            shutil.copy(orig_path, new_path)

    return converted, resized
from natsort import natsorted, ns
def nautilus_sort(filenames):
    """Return the given filenames in natural, case-insensitive order."""
    sort_flags = ns.IGNORECASE
    ordered = natsorted(filenames, alg=sort_flags)
    return ordered
def prep_dataset_directory(args):
    '''
    Rename all the files under args.root_dir with a unique string identifier
    and place them under args.output_dir, mirroring the sub-directory layout.

    Files in one directory that share a name but differ in extension
    (e.g. "a.jpg" and "a.txt") receive the same new identifier.

    Optionally (handled per file by process_file):
    - convert imgs to jpg
    - downsize imgs if needed

    Args:
        args: namespace providing root_dir, output_dir, mode,
            shuffle_file_order and the options read by process_file.

    Prints a per-directory progress line and a final summary; returns None.
    '''
    os.makedirs(args.output_dir, exist_ok=True)
    renamed_counter, converted_counter, resized_counter, skipped = 0, 0, 0, 0
    print_verb = "Copied" if args.mode == 'copy' else "Renamed"

    for subdir, dirs, files in os.walk(args.root_dir):
        print(f"Parsing {subdir}, subdirs: {dirs}, n_files: {len(files)}..")

        # Map the directory onto output_dir through its path relative to the
        # root; a plain str.replace would also rewrite later occurrences of
        # the root string inside the path.
        rel_dir = os.path.relpath(subdir, args.root_dir)
        new_folder = os.path.normpath(os.path.join(args.output_dir, rel_dir))

        # Group extensions by base filename, visiting files in natural order:
        unique_filenames = {}
        for file in nautilus_sort(files):
            filename, file_extension = os.path.splitext(file)
            unique_filenames.setdefault(filename, []).append(file_extension)

        # Random ids, sorted so that their order follows the original file order:
        uuids = nautilus_sort([uuid.uuid4().hex for _ in unique_filenames])
        if args.shuffle_file_order:
            # random.shuffle works in place and returns None, so its result
            # must not be assigned back to uuids.
            random.shuffle(uuids)

        # Wrapping the sized items view (not the enumerate) lets tqdm show a total.
        for i, (filename, extension_list) in enumerate(tqdm(unique_filenames.items())):
            for ext in extension_list:
                orig_filename = os.path.join(subdir, filename + ext)
                new_filename = os.path.join(new_folder, uuids[i] + ext)
                try:
                    converted, resized = process_file(orig_filename, new_filename, args)
                except Exception as e:
                    # Best effort: report the failing file and keep going.
                    print(f"Error on {orig_filename}: {e}")
                    skipped += 1
                else:
                    renamed_counter += 1
                    converted_counter += converted
                    resized_counter += resized

    print(f"{print_verb} {renamed_counter} files (converted {converted_counter}, resized {resized_counter}), skipped {skipped}")
if __name__ == "__main__":
    # This script renames all the files in the root_dir with a unique string identifier,
    # it also optionally converts all images to jpg and downsizes them if they are very large.
    parser = argparse.ArgumentParser()
    # root_dir is required: without it there is nothing to walk.
    parser.add_argument('--root_dir', type=str, required=True, help='Root directory of the dataset folder')
    parser.add_argument('--output_dir', type=str, default = None, help='Output directory')
    # Restricting the choices rejects typos up front; an unknown mode would
    # otherwise leave files untouched while still being reported as processed.
    parser.add_argument('--mode', type=str, default='copy', choices=['copy', 'rename'], help='Modes: rename (in place) or copy')
    parser.add_argument('--max_n_pixels', type=int, default=2048*2048, help='Resize when an img is larger than this')
    parser.add_argument('--convert_imgs_to_jpg', action='store_true', help='Convert all imgs to .jpg (default: False)')
    parser.add_argument('--shuffle_file_order', action='store_true', help='Randomly shuffle the alphabetical ordering of imgs (default: False)')
    args = parser.parse_args()

    if args.mode == 'copy' and args.output_dir is None:
        # parser.error prints the usage text and exits with status 2.
        parser.error("Output directory must be specified when mode is 'copy'")

    if args.output_dir is None:
        # Only reachable in 'rename' mode: the files are renamed in place.
        args.output_dir = args.root_dir

    if args.mode == 'rename':
        # Renaming consumes the original files, so ask for explicit confirmation.
        print("####### WARNING #######")
        print(f"you are about to rename / resize all the files inside {args.root_dir}, are you sure you want to do this?")
        answer = input("Type 'yes' to continue: ")
        if answer != 'yes':
            raise SystemExit("Aborted")

    prep_dataset_directory(args)