From 1d545f15a784f4f5058ddc990e616dd1c1eb9f82 Mon Sep 17 00:00:00 2001
From: tpc2233 <107653069+tpc2233@users.noreply.github.com>
Date: Sun, 13 Oct 2024 00:31:07 -0300
Subject: [PATCH 1/4] Delete app_multigpu_engine.py

---
 app_multigpu_engine.py | 121 -----------------------------------------
 1 file changed, 121 deletions(-)
 delete mode 100644 app_multigpu_engine.py

diff --git a/app_multigpu_engine.py b/app_multigpu_engine.py
deleted file mode 100644
index 968ec23..0000000
--- a/app_multigpu_engine.py
+++ /dev/null
@@ -1,121 +0,0 @@
-import os
-import torch
-import sys
-import argparse
-from diffusers.utils import export_to_video
-from pyramid_dit import PyramidDiTForVideoGeneration
-from trainer_misc import init_distributed_mode, init_sequence_parallel_group
-from PIL import Image
-
-def get_args():
-    parser = argparse.ArgumentParser('Pytorch Multi-process Script', add_help=False)
-    parser.add_argument('--model_dtype', default='bf16', type=str, help="The Model Dtype: bf16")
-    parser.add_argument('--model_path', required=True, type=str, help='Path to the downloaded checkpoint directory')
-    parser.add_argument('--variant', default='diffusion_transformer_768p', type=str,)
-    parser.add_argument('--task', default='t2v', type=str, choices=['i2v', 't2v'])
-    parser.add_argument('--temp', default=16, type=int, help='The generated latent num, num_frames = temp * 8 + 1')
-    parser.add_argument('--sp_group_size', default=2, type=int, help="The number of GPUs used for inference, should be 2 or 4")
-    parser.add_argument('--sp_proc_num', default=-1, type=int, help="The number of processes used for video training, default=-1 means using all processes.")
-    parser.add_argument('--prompt', type=str, required=True, help="Text prompt for video generation")
-    parser.add_argument('--image_path', type=str, help="Path to the input image for image-to-video")
-    parser.add_argument('--video_guidance_scale', type=float, default=5.0, help="Video guidance scale")
-    parser.add_argument('--guidance_scale', type=float, default=9.0, help="Guidance scale for text-to-video")
-    parser.add_argument('--resolution', type=str, default='768p', choices=['768p', '384p'], help="Model resolution")
-    parser.add_argument('--output_path', type=str, required=True, help="Path to save the generated video")
-    return parser.parse_args()
-
-def main():
-    args = get_args()
-
-    # setup DDP
-    init_distributed_mode(args)
-
-    assert args.world_size == args.sp_group_size, "The sequence parallel size should match DDP world size"
-
-    # Enable sequence parallel
-    init_sequence_parallel_group(args)
-
-    device = torch.device('cuda')
-    rank = args.rank
-    model_dtype = args.model_dtype
-
-    model = PyramidDiTForVideoGeneration(
-        args.model_path,
-        model_dtype,
-        model_variant=args.variant,
-    )
-
-    model.vae.to(device)
-    model.dit.to(device)
-    model.text_encoder.to(device)
-    model.vae.enable_tiling()
-
-    if model_dtype == "bf16":
-        torch_dtype = torch.bfloat16
-    elif model_dtype == "fp16":
-        torch_dtype = torch.float16
-    else:
-        torch_dtype = torch.float32
-
-    # The video generation config
-    if args.resolution == '768p':
-        width = 1280
-        height = 768
-    else:
-        width = 640
-        height = 384
-
-    try:
-        if args.task == 't2v':
-            prompt = args.prompt
-            with torch.no_grad(), torch.cuda.amp.autocast(enabled=True if model_dtype != 'fp32' else False, dtype=torch_dtype):
-                frames = model.generate(
-                    prompt=prompt,
-                    num_inference_steps=[20, 20, 20],
-                    video_num_inference_steps=[10, 10, 10],
-                    height=height,
-                    width=width,
-                    temp=args.temp,
-                    guidance_scale=args.guidance_scale,
-                    video_guidance_scale=args.video_guidance_scale,
-                    output_type="pil",
-                    save_memory=True,
-                    cpu_offloading=False,
-                    inference_multigpu=True,
-                )
-            if rank == 0:
-                export_to_video(frames, args.output_path, fps=24)
-
-        elif args.task == 'i2v':
-            if not args.image_path:
-                raise ValueError("Image path is required for image-to-video task")
-            image = Image.open(args.image_path).convert("RGB")
-            image = image.resize((width, height))
-
-            prompt = args.prompt
-
-            with torch.no_grad(), torch.cuda.amp.autocast(enabled=True if model_dtype != 'fp32' else False, dtype=torch_dtype):
-                frames = model.generate_i2v(
-                    prompt=prompt,
-                    input_image=image,
-                    num_inference_steps=[10, 10, 10],
-                    temp=args.temp,
-                    video_guidance_scale=args.video_guidance_scale,
-                    output_type="pil",
-                    save_memory=True,
-                    cpu_offloading=False,
-                    inference_multigpu=True,
-                )
-            if rank == 0:
-                export_to_video(frames, args.output_path, fps=24)
-
-    except Exception as e:
-        if rank == 0:
-            print(f"[ERROR] Error during video generation: {e}")
-        raise
-    finally:
-        torch.distributed.barrier()
-
-if __name__ == "__main__":
-    main()

From ee86b467f9e337912d31c72f3c339e2ff89712c3 Mon Sep 17 00:00:00 2001
From: tpc2233 <107653069+tpc2233@users.noreply.github.com>
Date: Sun, 13 Oct 2024 00:31:22 -0300
Subject: [PATCH 2/4] Delete app_multigpu_engine.sh

---
 app_multigpu_engine.sh | 44 ------------------------------------------
 1 file changed, 44 deletions(-)
 delete mode 100644 app_multigpu_engine.sh

diff --git a/app_multigpu_engine.sh b/app_multigpu_engine.sh
deleted file mode 100644
index f409c02..0000000
--- a/app_multigpu_engine.sh
+++ /dev/null
@@ -1,44 +0,0 @@
-#!/bin/bash
-
-# Usage:
-#   ./app_multigpu_engine.sh GPUS VARIANT MODEL_PATH TASK TEMP GUIDANCE_SCALE VIDEO_GUIDANCE_SCALE RESOLUTION OUTPUT_PATH [IMAGE_PATH] PROMPT
-
-GPUS=$1
-VARIANT=$2
-MODEL_PATH=$3
-TASK=$4
-TEMP=$5
-GUIDANCE_SCALE=$6
-VIDEO_GUIDANCE_SCALE=$7
-RESOLUTION=$8
-OUTPUT_PATH=$9
-shift 9
-# Now the remaining arguments are $@
-
-if [ "$TASK" == "t2v" ]; then
-    PROMPT="$1"
-    IMAGE_ARG=""
-elif [ "$TASK" == "i2v" ]; then
-    IMAGE_PATH="$1"
-    PROMPT="$2"
-    IMAGE_ARG="--image_path $IMAGE_PATH"
-else
-    echo "Invalid task: $TASK"
-    exit 1
-fi
-
-torchrun --nproc_per_node="$GPUS" \
-    app_multigpu_engine.py \
-    --model_path "$MODEL_PATH" \
-    --variant "$VARIANT" \
-    --task "$TASK" \
-    --model_dtype bf16 \
-    --temp "$TEMP" \
-    --sp_group_size "$GPUS" \
-    --guidance_scale "$GUIDANCE_SCALE" \
-    --video_guidance_scale "$VIDEO_GUIDANCE_SCALE" \
-    --resolution "$RESOLUTION" \
-    --output_path "$OUTPUT_PATH" \
-    --prompt "$PROMPT" \
-    $IMAGE_ARG

From 6d38b17af58b05a356382d1ad7e5923b8e2c655c Mon Sep 17 00:00:00 2001
From: tpc2233 <107653069+tpc2233@users.noreply.github.com>
Date: Sun, 13 Oct 2024 00:33:33 -0300
Subject: [PATCH 3/4] Move Gradio multi-GPU engine to scripts folder

Code cleanup; update Python import paths.
---
 scripts/app_multigpu_engine.py | 128 +++++++++++++++++++++++++++++++++
 scripts/app_multigpu_engine.sh |  55 ++++++++++++++
 2 files changed, 183 insertions(+)
 create mode 100644 scripts/app_multigpu_engine.py
 create mode 100644 scripts/app_multigpu_engine.sh

diff --git a/scripts/app_multigpu_engine.py b/scripts/app_multigpu_engine.py
new file mode 100644
index 0000000..9e6ede1
--- /dev/null
+++ b/scripts/app_multigpu_engine.py
@@ -0,0 +1,128 @@
+import os
+import sys
+import torch
+import argparse
+from PIL import Image
+from diffusers.utils import export_to_video
+
+# Add the project root directory to sys.path
+SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
+PROJECT_ROOT = os.path.dirname(SCRIPT_DIR)
+if PROJECT_ROOT not in sys.path:
+    sys.path.insert(0, PROJECT_ROOT)
+
+from pyramid_dit import PyramidDiTForVideoGeneration
+from trainer_misc import init_distributed_mode, init_sequence_parallel_group
+
+def get_args():
+    parser = argparse.ArgumentParser('Pytorch Multi-process Script', add_help=False)
+    parser.add_argument('--model_dtype', default='bf16', type=str, help="The Model Dtype: bf16")
+    parser.add_argument('--model_path', required=True, type=str, help='Path to the downloaded checkpoint directory')
+    parser.add_argument('--variant', default='diffusion_transformer_768p', type=str)
+    parser.add_argument('--task', default='t2v', type=str, choices=['i2v', 't2v'])
+    parser.add_argument('--temp', default=16, type=int, help='The generated latent num, num_frames = temp * 8 + 1')
+    parser.add_argument('--sp_group_size', default=2, type=int, help="The number of GPUs used for inference, should be 2 or 4")
+    parser.add_argument('--sp_proc_num', default=-1, type=int, help="The number of processes used for video training, default=-1 means using all processes.")
+    parser.add_argument('--prompt', type=str, required=True, help="Text prompt for video generation")
+    parser.add_argument('--image_path', type=str, help="Path to the input image for image-to-video")
+    parser.add_argument('--video_guidance_scale', type=float, default=5.0, help="Video guidance scale")
+    parser.add_argument('--guidance_scale', type=float, default=9.0, help="Guidance scale for text-to-video")
+    parser.add_argument('--resolution', type=str, default='768p', choices=['768p', '384p'], help="Model resolution")
+    parser.add_argument('--output_path', type=str, required=True, help="Path to save the generated video")
+    return parser.parse_args()
+
+def main():
+    args = get_args()
+
+    # Setup DDP
+    init_distributed_mode(args)
+
+    assert args.world_size == args.sp_group_size, "The sequence parallel size should match DDP world size"
+
+    # Enable sequence parallel
+    init_sequence_parallel_group(args)
+
+    device = torch.device('cuda')
+    rank = args.rank
+    model_dtype = args.model_dtype
+
+    model = PyramidDiTForVideoGeneration(
+        args.model_path,
+        model_dtype,
+        model_variant=args.variant,
+    )
+
+    model.vae.to(device)
+    model.dit.to(device)
+    model.text_encoder.to(device)
+    model.vae.enable_tiling()
+
+    if model_dtype == "bf16":
+        torch_dtype = torch.bfloat16
+    elif model_dtype == "fp16":
+        torch_dtype = torch.float16
+    else:
+        torch_dtype = torch.float32
+
+    # The video generation config
+    if args.resolution == '768p':
+        width = 1280
+        height = 768
+    else:
+        width = 640
+        height = 384
+
+    try:
+        if args.task == 't2v':
+            prompt = args.prompt
+            with torch.no_grad(), torch.cuda.amp.autocast(enabled=(model_dtype != 'fp32'), dtype=torch_dtype):
+                frames = model.generate(
+                    prompt=prompt,
+                    num_inference_steps=[20, 20, 20],
+                    video_num_inference_steps=[10, 10, 10],
+                    height=height,
+                    width=width,
+                    temp=args.temp,
+                    guidance_scale=args.guidance_scale,
+                    video_guidance_scale=args.video_guidance_scale,
+                    output_type="pil",
+                    save_memory=True,
+                    cpu_offloading=False,
+                    inference_multigpu=True,
+                )
+            if rank == 0:
+                export_to_video(frames, args.output_path, fps=24)
+
+        elif args.task == 'i2v':
+            if not args.image_path:
+                raise ValueError("Image path is required for image-to-video task")
+            image = Image.open(args.image_path).convert("RGB")
+            image = image.resize((width, height))
+
+            prompt = args.prompt
+
+            with torch.no_grad(), torch.cuda.amp.autocast(enabled=(model_dtype != 'fp32'), dtype=torch_dtype):
+                frames = model.generate_i2v(
+                    prompt=prompt,
+                    input_image=image,
+                    num_inference_steps=[10, 10, 10],
+                    temp=args.temp,
+                    video_guidance_scale=args.video_guidance_scale,
+                    output_type="pil",
+                    save_memory=True,
+                    cpu_offloading=False,
+                    inference_multigpu=True,
+                )
+            if rank == 0:
+                export_to_video(frames, args.output_path, fps=24)
+
+    except Exception as e:
+        if rank == 0:
+            print(f"[ERROR] Error during video generation: {e}")
+        raise
+    finally:
+        torch.distributed.barrier()
+
+if __name__ == "__main__":
+    main()

diff --git a/scripts/app_multigpu_engine.sh b/scripts/app_multigpu_engine.sh
new file mode 100644
index 0000000..a2b97f4
--- /dev/null
+++ b/scripts/app_multigpu_engine.sh
@@ -0,0 +1,55 @@
+#!/bin/bash
+
+# Usage:
+#   ./scripts/app_multigpu_engine.sh GPUS VARIANT MODEL_PATH TASK TEMP GUIDANCE_SCALE VIDEO_GUIDANCE_SCALE RESOLUTION OUTPUT_PATH [IMAGE_PATH] PROMPT
+
+GPUS=$1
+VARIANT=$2
+MODEL_PATH=$3
+TASK=$4
+TEMP=$5
+GUIDANCE_SCALE=$6
+VIDEO_GUIDANCE_SCALE=$7
+RESOLUTION=$8
+OUTPUT_PATH=$9
+shift 9
+# Now the remaining arguments are $@
+
+if [ "$TASK" == "t2v" ]; then
+    PROMPT="$1"
+    IMAGE_ARG=""
+elif [ "$TASK" == "i2v" ]; then
+    IMAGE_PATH="$1"
+    PROMPT="$2"
+    IMAGE_ARG="--image_path $IMAGE_PATH"
+else
+    echo "Invalid task: $TASK"
+    exit 1
+fi
+
+# Get the directory where the script is located
+SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
+# Get the project root directory (parent directory of scripts)
+PROJECT_ROOT="$(dirname "$SCRIPT_DIR")"
+
+# Set PYTHONPATH to include the project root directory
+export PYTHONPATH="$PROJECT_ROOT:$PYTHONPATH"
+
+# Adjust the path to app_multigpu_engine.py
+PYTHON_SCRIPT="$SCRIPT_DIR/app_multigpu_engine.py"
+
+torchrun --nproc_per_node="$GPUS" \
+    "$PYTHON_SCRIPT" \
+    --model_path "$MODEL_PATH" \
+    --variant "$VARIANT" \
+    --task "$TASK" \
+    --model_dtype bf16 \
+    --temp "$TEMP" \
+    --sp_group_size "$GPUS" \
+    --guidance_scale "$GUIDANCE_SCALE" \
+    --video_guidance_scale "$VIDEO_GUIDANCE_SCALE" \
+    --resolution "$RESOLUTION" \
+    --output_path "$OUTPUT_PATH" \
+    --prompt "$PROMPT" \
+    $IMAGE_ARG

From ffa02b045716aeed656a09b6a8a777cabab8a0a0 Mon Sep 17 00:00:00 2001
From: tpc2233 <107653069+tpc2233@users.noreply.github.com>
Date: Sun, 13 Oct 2024 00:35:21 -0300
Subject: [PATCH 4/4] Update app_multigpu.py

The engine scripts now live in the scripts folder; point the Gradio
app at the new script path.
---
 app_multigpu.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/app_multigpu.py b/app_multigpu.py
index 1800b5d..01e144c 100644
--- a/app_multigpu.py
+++ b/app_multigpu.py
@@ -14,7 +14,7 @@ def run_inference_multigpu(gpus, variant, model_path, temp, guidance_scale, vide
         output_video = os.path.join(tmpdir, f"{uuid.uuid4()}_output.mp4")
 
         # Path to the external shell script
-        script_path = "./app_multigpu_engine.sh"  # Updated script name
+        script_path = "./scripts/app_multigpu_engine.sh"  # Updated script path
 
         # Prepare the command
         cmd = [
@@ -141,4 +141,3 @@ def generate_text_to_video(prompt, temp, guidance_scale, video_guidance_scale, r
 
 # Launch Gradio app
 demo.launch(share=True)
-
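
Usage note: the relocated launcher keeps the positional interface documented in
its header, i.e. GPUS VARIANT MODEL_PATH TASK TEMP GUIDANCE_SCALE
VIDEO_GUIDANCE_SCALE RESOLUTION OUTPUT_PATH [IMAGE_PATH] PROMPT. A minimal
sketch of a text-to-video run once the series is applied, assuming two GPUs and
a checkpoint directory at ./pyramid-flow-model (both values are illustrative,
not part of the patches):

    # t2v: the prompt is the final positional argument; temp=16 latents
    # corresponds to 16 * 8 + 1 = 129 frames
    ./scripts/app_multigpu_engine.sh 2 diffusion_transformer_768p ./pyramid-flow-model \
        t2v 16 9.0 5.0 768p ./output.mp4 \
        "a cat gracefully surfing a wave at sunset"

For image-to-video the optional IMAGE_PATH slots in between OUTPUT_PATH and the
prompt; GUIDANCE_SCALE must still be supplied positionally even though the i2v
branch only consumes the video guidance scale:

    ./scripts/app_multigpu_engine.sh 2 diffusion_transformer_768p ./pyramid-flow-model \
        i2v 16 9.0 5.0 768p ./output.mp4 ./input.jpg \
        "the scene slowly comes to life"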
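
One sharp edge worth knowing about: $IMAGE_ARG is expanded unquoted in the
torchrun call so that it vanishes entirely for t2v, which also means an i2v
image path containing spaces would be word-split. A hedged alternative, should
that ever matter, is to collect the optional flag in a bash array (a sketch,
not part of this series):

    # Build the optional flag as an array: empty for t2v, quoting-safe for i2v
    IMAGE_ARGS=()
    if [ "$TASK" == "i2v" ]; then
        IMAGE_ARGS=(--image_path "$IMAGE_PATH")
    fi
    # ...and end the torchrun invocation with:
    #     --prompt "$PROMPT" \
    #     "${IMAGE_ARGS[@]}"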