import os
import time
import torch
import shutil
import argparse
import numpy as np
from tqdm import tqdm
from PIL import Image
from datasets import load_dataset
from accelerate import Accelerator
from diffusers.utils import load_image
from diffusers import (
    AutoencoderKL,
    StableDiffusionXLControlNetPipeline,
    ControlNetModel,
    UNet2DConditionModel,
)
from huggingface_hub import hf_hub_download
from safetensors.torch import load_file
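
# Evaluate a colorization ControlNet paired with SDXL-Lightning: grayscale copies of the
# validation images are used as the ControlNet condition, the pipeline colorizes them, and
# the predicted chroma is transplanted onto the ground-truth luminance for comparison.
#
# Illustrative invocation (paths and checkpoint names are placeholders; adjust to your setup):
#   python eval_controlnet_sdxl_light.py \
#     --pretrained_model_name_or_path stabilityai/stable-diffusion-xl-base-1.0 \
#     --repo ByteDance/SDXL-Lightning \
#     --ckpt sdxl_lightning_4step_unet.safetensors \
#     --num_inference_steps 4 \
#     --controlnet_model_name_or_path path/to/trained/controlnet \
#     --output_dir output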


# Parse command-line arguments
def parse_args(input_args=None):
    parser = argparse.ArgumentParser(description="Simple example of a ControlNet evaluation script.")
    parser.add_argument(
        "--pretrained_model_name_or_path",
        type=str,
        default=None,
        required=True,
        help="Path to pretrained model or model identifier from huggingface.co/models.",
    )
    parser.add_argument(
        "--pretrained_vae_model_name_or_path",
        type=str,
        default=None,
        help="Path to an improved VAE to stabilize training. For more details check out: https://github.com/huggingface/diffusers/pull/4038.",
    )
    parser.add_argument(
        "--controlnet_model_name_or_path",
        type=str,
        default=None,
        required=True,
        help="Path to pretrained ControlNet model.",
    )
    parser.add_argument(
        "--output_dir",
        type=str,
        default=None,
        required=True,
        help="Path to output results.",
    )
    parser.add_argument(
        "--dataset",
        type=str,
        default="nickpai/coco2017-colorization",
        help="Dataset used for evaluation.",
    )
    parser.add_argument(
        "--dataset_revision",
        type=str,
        default="caption-free",
        choices=["main", "caption-free", "custom-caption"],
        help="Dataset revision to load (main/caption-free/custom-caption).",
    )
    parser.add_argument(
        "--mixed_precision",
        type=str,
        default=None,
        choices=["no", "fp16", "bf16"],
        help=(
            "Whether to use mixed precision. Choose between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >="
            " 1.10 and an Nvidia Ampere GPU. Defaults to the value of the accelerate config of the current system or the"
            " flag passed with the `accelerate.launch` command. Use this argument to override the accelerate config."
        ),
    )
    parser.add_argument(
        "--variant",
        type=str,
        default=None,
        help="Variant of the model files of the pretrained model identifier from huggingface.co/models, e.g. fp16.",
    )
    parser.add_argument(
        "--revision",
        type=str,
        default=None,
        required=False,
        help="Revision of pretrained model identifier from huggingface.co/models.",
    )
    parser.add_argument(
        "--num_inference_steps",
        type=int,
        default=8,
        help="Number of denoising steps; should match the distilled checkpoint (1, 2, 4, or 8 steps).",
    )
    parser.add_argument(
        "--repo",
        type=str,
        default="ByteDance/SDXL-Lightning",
        required=True,
        help="SDXL-Lightning repository on huggingface.co.",
    )
    parser.add_argument(
        "--ckpt",
        type=str,
        default="sdxl_lightning_4step_unet.safetensors",
        required=True,
        help="Checkpoint filename available in the repository.",
    )
    parser.add_argument(
        "--negative_prompt",
        action="store_true",
        help="Use a fixed negative prompt to discourage grayscale or low-quality outputs.",
    )

    if input_args is not None:
        args = parser.parse_args(input_args)
    else:
        args = parser.parse_args()

    return args
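

# apply_color keeps the luminance (L) channel of `image` and takes the chroma (a/b)
# channels from `color_map`, so the colorized result cannot alter the structure or
# brightness of the original photo.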
def apply_color(image, color_map):
    # Convert input images to LAB color space
    image_lab = image.convert('LAB')
    color_map_lab = color_map.convert('LAB')

    # Split LAB channels
    l, a, b = image_lab.split()
    _, a_map, b_map = color_map_lab.split()

    # Merge LAB channels with color map
    merged_lab = Image.merge('LAB', (l, a_map, b_map))

    # Convert merged LAB image back to RGB color space
    result_rgb = merged_lab.convert('RGB')
    return result_rgb


def main(args):
    generator = torch.manual_seed(0)

    # Path to the eval_results folder
    eval_results_folder = os.path.join(args.output_dir, "results")

    # Remove eval_results folder if it exists
    if os.path.exists(eval_results_folder):
        shutil.rmtree(eval_results_folder)

    # Create directory for eval_results
    os.makedirs(eval_results_folder)

    # Create subfolders for compare and colorized images
    compare_folder = os.path.join(eval_results_folder, "compare")
    colorized_folder = os.path.join(eval_results_folder, "colorized")
    os.makedirs(compare_folder)
    os.makedirs(colorized_folder)

    # Load the validation split of the colorization dataset
    val_dataset = load_dataset(args.dataset, split="validation", revision=args.dataset_revision)
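
    # Accelerator handles device placement; weight_dtype follows the accelerate config
    # unless it is overridden by --mixed_precision.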
    accelerator = Accelerator(
        mixed_precision=args.mixed_precision,
    )

    weight_dtype = torch.float32
    if accelerator.mixed_precision == "fp16":
        weight_dtype = torch.float16
    elif accelerator.mixed_precision == "bf16":
        weight_dtype = torch.bfloat16

    vae_path = (
        args.pretrained_model_name_or_path
        if args.pretrained_vae_model_name_or_path is None
        else args.pretrained_vae_model_name_or_path
    )
    vae = AutoencoderKL.from_pretrained(
        vae_path,
        subfolder="vae" if args.pretrained_vae_model_name_or_path is None else None,
        revision=args.revision,
        variant=args.variant,
    )
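
    # Build the base SDXL UNet from its config and replace its weights with the
    # SDXL-Lightning distilled checkpoint; --num_inference_steps should match the
    # step count the checkpoint was distilled for.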
    unet = UNet2DConditionModel.from_config(
        args.pretrained_model_name_or_path,
        subfolder="unet",
        revision=args.revision,
        variant=args.variant,
    )
    unet.load_state_dict(load_file(hf_hub_download(args.repo, args.ckpt)))
    # Move the VAE and UNet to the device and cast them to weight_dtype.
    # The default VAE stays in float32 to avoid NaNs; a user-supplied VAE
    # (e.g. an fp16-fix VAE) is cast to weight_dtype.
    if args.pretrained_vae_model_name_or_path is not None:
        vae.to(accelerator.device, dtype=weight_dtype)
    else:
        vae.to(accelerator.device, dtype=torch.float32)
    unet.to(accelerator.device, dtype=weight_dtype)

    controlnet = ControlNetModel.from_pretrained(args.controlnet_model_name_or_path, torch_dtype=weight_dtype)

    pipe = StableDiffusionXLControlNetPipeline.from_pretrained(
        args.pretrained_model_name_or_path,
        vae=vae,
        unet=unet,
        controlnet=controlnet,
    )
    pipe.to(accelerator.device, dtype=weight_dtype)

    # Prepare everything with our `accelerator`.
    pipe, val_dataset = accelerator.prepare(pipe, val_dataset)
    pipe.safety_checker = None
    # Counter for processed images
    processed_images = 0

    # Record start time
    start_time = time.time()

    # Iterate through the validation dataset
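    # For each image: a grayscale copy serves as the ControlNet condition, the pipeline
    # predicts a colorized image, and apply_color transplants its chroma onto the
    # ground-truth luminance before the comparison strip and colorized output are saved.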
    for example in tqdm(val_dataset, desc="Processing Images"):
        image_path = example["file_name"]

        prompt = []
        for caption in example["captions"]:
            if isinstance(caption, str):
                prompt.append(caption)
            elif isinstance(caption, (list, np.ndarray)):
                # take the first caption if there are multiple
                prompt.append(caption[0])
            else:
                raise ValueError(
                    "Caption column `captions` should contain either strings or lists of strings."
                )

        negative_prompt = None
        if args.negative_prompt:
            negative_prompt = [
                "low quality, bad quality, low contrast, black and white, bw, monochrome, grainy, blurry, historical, restored, desaturate"
            ]

        # Generate image
        ground_truth_image = load_image(image_path).resize((512, 512))
        control_image = load_image(image_path).convert("L").convert("RGB").resize((512, 512))
        image = pipe(prompt=prompt,
                     negative_prompt=negative_prompt,
                     num_inference_steps=args.num_inference_steps,
                     generator=generator,
                     image=control_image).images[0]

        # Apply color mapping
        image = apply_color(ground_truth_image, image)

        # Concatenate images into a row
        row_image = np.hstack((np.array(control_image), np.array(image), np.array(ground_truth_image)))
        row_image = Image.fromarray(row_image)

        # Save row image in the compare folder
        compare_output_path = os.path.join(compare_folder, f"{image_path.split('/')[-1]}")
        row_image.save(compare_output_path)

        # Save colorized image in the colorized folder
        colorized_output_path = os.path.join(colorized_folder, f"{image_path.split('/')[-1]}")
        image.save(colorized_output_path)

        # Increment processed images counter
        processed_images += 1

    # Record end time
    end_time = time.time()

    # Calculate total time taken
    total_time = end_time - start_time

    # Calculate FPS
    fps = processed_images / total_time

    print("All images processed.")
    print(f"Total time taken: {total_time:.2f} seconds")
    print(f"FPS: {fps:.2f}")


# Entry point of the script
if __name__ == "__main__":
    args = parse_args()
    main(args)