From 1d545f15a784f4f5058ddc990e616dd1c1eb9f82 Mon Sep 17 00:00:00 2001
From: tpc2233 <107653069+tpc2233@users.noreply.github.com>
Date: Sun, 13 Oct 2024 00:31:07 -0300
Subject: [PATCH 1/4] Delete app_multigpu_engine.py

---
 app_multigpu_engine.py | 121 -----------------------------------------
 1 file changed, 121 deletions(-)
 delete mode 100644 app_multigpu_engine.py

diff --git a/app_multigpu_engine.py b/app_multigpu_engine.py
deleted file mode 100644
index 968ec23..0000000
--- a/app_multigpu_engine.py
+++ /dev/null
@@ -1,121 +0,0 @@
-import os
-import torch
-import sys
-import argparse
-from diffusers.utils import export_to_video
-from pyramid_dit import PyramidDiTForVideoGeneration
-from trainer_misc import init_distributed_mode, init_sequence_parallel_group
-from PIL import Image
-
-def get_args():
-    parser = argparse.ArgumentParser('Pytorch Multi-process Script', add_help=False)
-    parser.add_argument('--model_dtype', default='bf16', type=str, help="The Model Dtype: bf16")
-    parser.add_argument('--model_path', required=True, type=str, help='Path to the downloaded checkpoint directory')
-    parser.add_argument('--variant', default='diffusion_transformer_768p', type=str,)
-    parser.add_argument('--task', default='t2v', type=str, choices=['i2v', 't2v'])
-    parser.add_argument('--temp', default=16, type=int, help='The generated latent num, num_frames = temp * 8 + 1')
-    parser.add_argument('--sp_group_size', default=2, type=int, help="The number of GPUs used for inference, should be 2 or 4")
-    parser.add_argument('--sp_proc_num', default=-1, type=int, help="The number of processes used for video training, default=-1 means using all processes.")
-    parser.add_argument('--prompt', type=str, required=True, help="Text prompt for video generation")
-    parser.add_argument('--image_path', type=str, help="Path to the input image for image-to-video")
-    parser.add_argument('--video_guidance_scale', type=float, default=5.0, help="Video guidance scale")
-    parser.add_argument('--guidance_scale', type=float, default=9.0, help="Guidance scale for text-to-video")
-    parser.add_argument('--resolution', type=str, default='768p', choices=['768p', '384p'], help="Model resolution")
-    parser.add_argument('--output_path', type=str, required=True, help="Path to save the generated video")
-    return parser.parse_args()
-
-def main():
-    args = get_args()
-
-    # setup DDP
-    init_distributed_mode(args)
-
-    assert args.world_size == args.sp_group_size, "The sequence parallel size should match DDP world size"
-
-    # Enable sequence parallel
-    init_sequence_parallel_group(args)
-
-    device = torch.device('cuda')
-    rank = args.rank
-    model_dtype = args.model_dtype
-
-    model = PyramidDiTForVideoGeneration(
-        args.model_path,
-        model_dtype,
-        model_variant=args.variant,
-    )
-
-    model.vae.to(device)
-    model.dit.to(device)
-    model.text_encoder.to(device)
-    model.vae.enable_tiling()
-
-    if model_dtype == "bf16":
-        torch_dtype = torch.bfloat16
-    elif model_dtype == "fp16":
-        torch_dtype = torch.float16
-    else:
-        torch_dtype = torch.float32
-
-    # The video generation config
-    if args.resolution == '768p':
-        width = 1280
-        height = 768
-    else:
-        width = 640
-        height = 384
-
-    try:
-        if args.task == 't2v':
-            prompt = args.prompt
-            with torch.no_grad(), torch.cuda.amp.autocast(enabled=True if model_dtype != 'fp32' else False, dtype=torch_dtype):
-                frames = model.generate(
-                    prompt=prompt,
-                    num_inference_steps=[20, 20, 20],
-                    video_num_inference_steps=[10, 10, 10],
-                    height=height,
-                    width=width,
-                    temp=args.temp,
-                    guidance_scale=args.guidance_scale,
-                    video_guidance_scale=args.video_guidance_scale,
-                    output_type="pil",
-                    save_memory=True,
-                    cpu_offloading=False,
-                    inference_multigpu=True,
-                )
-            if rank == 0:
-                export_to_video(frames, args.output_path, fps=24)
-
-        elif args.task == 'i2v':
-            if not args.image_path:
-                raise ValueError("Image path is required for image-to-video task")
-            image = Image.open(args.image_path).convert("RGB")
-            image = image.resize((width, height))
-
-            prompt = args.prompt
-
-            with torch.no_grad(), torch.cuda.amp.autocast(enabled=True if model_dtype != 'fp32' else False, dtype=torch_dtype):
-                frames = model.generate_i2v(
-                    prompt=prompt,
-                    input_image=image,
-                    num_inference_steps=[10, 10, 10],
-                    temp=args.temp,
-                    video_guidance_scale=args.video_guidance_scale,
-                    output_type="pil",
-                    save_memory=True,
-                    cpu_offloading=False,
-                    inference_multigpu=True,
-                )
-            if rank == 0:
-                export_to_video(frames, args.output_path, fps=24)
-
-    except Exception as e:
-        if rank == 0:
-            print(f"[ERROR] Error during video generation: {e}")
-        raise
-    finally:
-        torch.distributed.barrier()
-
-if __name__ == "__main__":
-    main()

From ee86b467f9e337912d31c72f3c339e2ff89712c3 Mon Sep 17 00:00:00 2001
From: tpc2233 <107653069+tpc2233@users.noreply.github.com>
Date: Sun, 13 Oct 2024 00:31:22 -0300
Subject: [PATCH 2/4] Delete app_multigpu_engine.sh

---
 app_multigpu_engine.sh | 44 ------------------------------------------
 1 file changed, 44 deletions(-)
 delete mode 100644 app_multigpu_engine.sh

diff --git a/app_multigpu_engine.sh b/app_multigpu_engine.sh
deleted file mode 100644
index f409c02..0000000
--- a/app_multigpu_engine.sh
+++ /dev/null
@@ -1,44 +0,0 @@
-#!/bin/bash
-
-# Usage:
-#   ./app_multigpu_engine.sh GPUS VARIANT MODEL_PATH TASK TEMP GUIDANCE_SCALE VIDEO_GUIDANCE_SCALE RESOLUTION OUTPUT_PATH [IMAGE_PATH] PROMPT
-
-GPUS=$1
-VARIANT=$2
-MODEL_PATH=$3
-TASK=$4
-TEMP=$5
-GUIDANCE_SCALE=$6
-VIDEO_GUIDANCE_SCALE=$7
-RESOLUTION=$8
-OUTPUT_PATH=$9
-shift 9
-# Now the remaining arguments are $@
-
-if [ "$TASK" == "t2v" ]; then
-    PROMPT="$1"
-    IMAGE_ARG=""
-elif [ "$TASK" == "i2v" ]; then
-    IMAGE_PATH="$1"
-    PROMPT="$2"
-    IMAGE_ARG="--image_path $IMAGE_PATH"
-else
-    echo "Invalid task: $TASK"
-    exit 1
-fi
-
-torchrun --nproc_per_node="$GPUS" \
-    app_multigpu_engine.py \
-    --model_path "$MODEL_PATH" \
-    --variant "$VARIANT" \
-    --task "$TASK" \
-    --model_dtype bf16 \
-    --temp "$TEMP" \
-    --sp_group_size "$GPUS" \
-    --guidance_scale "$GUIDANCE_SCALE" \
-    --video_guidance_scale "$VIDEO_GUIDANCE_SCALE" \
-    --resolution "$RESOLUTION" \
-    --output_path "$OUTPUT_PATH" \
-    --prompt "$PROMPT" \
-    $IMAGE_ARG

From 6d38b17af58b05a356382d1ad7e5923b8e2c655c Mon Sep 17 00:00:00 2001
From: tpc2233 <107653069+tpc2233@users.noreply.github.com>
Date: Sun, 13 Oct 2024 00:33:33 -0300
Subject: [PATCH 3/4] Move Gradio multi-GPU engine to scripts folder

Code cleanup; update Python import paths.
---
 scripts/app_multigpu_engine.py | 128 +++++++++++++++++++++++++++++++++
 scripts/app_multigpu_engine.sh |  55 ++++++++++++++
 2 files changed, 183 insertions(+)
 create mode 100644 scripts/app_multigpu_engine.py
 create mode 100644 scripts/app_multigpu_engine.sh

diff --git a/scripts/app_multigpu_engine.py b/scripts/app_multigpu_engine.py
new file mode 100644
index 0000000..9e6ede1
--- /dev/null
+++ b/scripts/app_multigpu_engine.py
@@ -0,0 +1,128 @@
+import os
+import sys
+import torch
+import argparse
+from PIL import Image
+from diffusers.utils import export_to_video
+
+# Add the project root directory to sys.path
+SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
+PROJECT_ROOT = os.path.dirname(SCRIPT_DIR)
+if PROJECT_ROOT not in sys.path:
+    sys.path.insert(0, PROJECT_ROOT)
+
+from pyramid_dit import PyramidDiTForVideoGeneration
+from trainer_misc import init_distributed_mode, init_sequence_parallel_group
+
+def get_args():
+    parser = argparse.ArgumentParser('Pytorch Multi-process Script', add_help=False)
+    parser.add_argument('--model_dtype', default='bf16', type=str, help="The Model Dtype: bf16")
+    parser.add_argument('--model_path', required=True, type=str, help='Path to the downloaded checkpoint directory')
+    parser.add_argument('--variant', default='diffusion_transformer_768p', type=str)
+    parser.add_argument('--task', default='t2v', type=str, choices=['i2v', 't2v'])
+    parser.add_argument('--temp', default=16, type=int, help='The generated latent num, num_frames = temp * 8 + 1')
+    parser.add_argument('--sp_group_size', default=2, type=int, help="The number of GPUs used for inference, should be 2 or 4")
+    parser.add_argument('--sp_proc_num', default=-1, type=int, help="The number of processes used for video training, default=-1 means using all processes.")
+    parser.add_argument('--prompt', type=str, required=True, help="Text prompt for video generation")
+    parser.add_argument('--image_path', type=str, help="Path to the input image for image-to-video")
+    parser.add_argument('--video_guidance_scale', type=float, default=5.0, help="Video guidance scale")
+    parser.add_argument('--guidance_scale', type=float, default=9.0, help="Guidance scale for text-to-video")
+    parser.add_argument('--resolution', type=str, default='768p', choices=['768p', '384p'], help="Model resolution")
+    parser.add_argument('--output_path', type=str, required=True, help="Path to save the generated video")
+    return parser.parse_args()
+
+def main():
+    args = get_args()
+
+    # Setup DDP
+    init_distributed_mode(args)
+
+    assert args.world_size == args.sp_group_size, "The sequence parallel size should match DDP world size"
+
+    # Enable sequence parallel
+    init_sequence_parallel_group(args)
+
+    device = torch.device('cuda')
+    rank = args.rank
+    model_dtype = args.model_dtype
+
+    model = PyramidDiTForVideoGeneration(
+        args.model_path,
+        model_dtype,
+        model_variant=args.variant,
+    )
+
+    model.vae.to(device)
+    model.dit.to(device)
+    model.text_encoder.to(device)
+    model.vae.enable_tiling()
+
+    if model_dtype == "bf16":
+        torch_dtype = torch.bfloat16
+    elif model_dtype == "fp16":
+        torch_dtype = torch.float16
+    else:
+        torch_dtype = torch.float32
+
+    # The video generation config
+    if args.resolution == '768p':
+        width = 1280
+        height = 768
+    else:
+        width = 640
+        height = 384
+
+    try:
+        if args.task == 't2v':
+            prompt = args.prompt
+            with torch.no_grad(), torch.cuda.amp.autocast(enabled=(model_dtype != 'fp32'), dtype=torch_dtype):
+                frames = model.generate(
+                    prompt=prompt,
+                    num_inference_steps=[20, 20, 20],
+                    video_num_inference_steps=[10, 10, 10],
+                    height=height,
+                    width=width,
+                    temp=args.temp,
+                    guidance_scale=args.guidance_scale,
+                    video_guidance_scale=args.video_guidance_scale,
+                    output_type="pil",
+                    save_memory=True,
+                    cpu_offloading=False,
+                    inference_multigpu=True,
+                )
+            if rank == 0:
+                export_to_video(frames, args.output_path, fps=24)
+
+        elif args.task == 'i2v':
+            if not args.image_path:
+                raise ValueError("Image path is required for image-to-video task")
+            image = Image.open(args.image_path).convert("RGB")
+            image = image.resize((width, height))
+
+            prompt = args.prompt
+
+            with torch.no_grad(), torch.cuda.amp.autocast(enabled=(model_dtype != 'fp32'), dtype=torch_dtype):
+                frames = model.generate_i2v(
+                    prompt=prompt,
+                    input_image=image,
+                    num_inference_steps=[10, 10, 10],
+                    temp=args.temp,
+                    video_guidance_scale=args.video_guidance_scale,
+                    output_type="pil",
+                    save_memory=True,
+                    cpu_offloading=False,
+                    inference_multigpu=True,
+                )
+            if rank == 0:
+                export_to_video(frames, args.output_path, fps=24)
+
+    except Exception as e:
+        if rank == 0:
+            print(f"[ERROR] Error during video generation: {e}")
+        raise
+    finally:
+        torch.distributed.barrier()
+
+if __name__ == "__main__":
+    main()

diff --git a/scripts/app_multigpu_engine.sh b/scripts/app_multigpu_engine.sh
new file mode 100644
index 0000000..a2b97f4
--- /dev/null
+++ b/scripts/app_multigpu_engine.sh
@@ -0,0 +1,55 @@
+#!/bin/bash
+
+# Usage:
+#   ./scripts/app_multigpu_engine.sh GPUS VARIANT MODEL_PATH TASK TEMP GUIDANCE_SCALE VIDEO_GUIDANCE_SCALE RESOLUTION OUTPUT_PATH [IMAGE_PATH] PROMPT
+
+GPUS=$1
+VARIANT=$2
+MODEL_PATH=$3
+TASK=$4
+TEMP=$5
+GUIDANCE_SCALE=$6
+VIDEO_GUIDANCE_SCALE=$7
+RESOLUTION=$8
+OUTPUT_PATH=$9
+shift 9
+# Now the remaining arguments are $@
+
+if [ "$TASK" == "t2v" ]; then
+    PROMPT="$1"
+    IMAGE_ARG=""
+elif [ "$TASK" == "i2v" ]; then
+    IMAGE_PATH="$1"
+    PROMPT="$2"
+    IMAGE_ARG="--image_path $IMAGE_PATH"
+else
+    echo "Invalid task: $TASK"
+    exit 1
+fi
+
+# Get the directory where the script is located
+SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
+# Get the project root directory (parent directory of scripts)
+PROJECT_ROOT="$(dirname "$SCRIPT_DIR")"
+
+# Set PYTHONPATH to include the project root directory
+export PYTHONPATH="$PROJECT_ROOT:$PYTHONPATH"
+
+# Adjust the path to app_multigpu_engine.py
+PYTHON_SCRIPT="$SCRIPT_DIR/app_multigpu_engine.py"
+
+torchrun --nproc_per_node="$GPUS" \
+    "$PYTHON_SCRIPT" \
+    --model_path "$MODEL_PATH" \
+    --variant "$VARIANT" \
+    --task "$TASK" \
+    --model_dtype bf16 \
+    --temp "$TEMP" \
+    --sp_group_size "$GPUS" \
+    --guidance_scale "$GUIDANCE_SCALE" \
+    --video_guidance_scale "$VIDEO_GUIDANCE_SCALE" \
+    --resolution "$RESOLUTION" \
+    --output_path "$OUTPUT_PATH" \
+    --prompt "$PROMPT" \
+    $IMAGE_ARG

From ffa02b045716aeed656a09b6a8a777cabab8a0a0 Mon Sep 17 00:00:00 2001
From: tpc2233 <107653069+tpc2233@users.noreply.github.com>
Date: Sun, 13 Oct 2024 00:35:21 -0300
Subject: [PATCH 4/4] Update app_multigpu.py

The engine scripts now live in the scripts folder; point the Gradio
app at the new script path.
---
 app_multigpu.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/app_multigpu.py b/app_multigpu.py
index 1800b5d..01e144c 100644
--- a/app_multigpu.py
+++ b/app_multigpu.py
@@ -14,7 +14,7 @@ def run_inference_multigpu(gpus, variant, model_path, temp, guidance_scale, vide
         output_video = os.path.join(tmpdir, f"{uuid.uuid4()}_output.mp4")
 
         # Path to the external shell script
-        script_path = "./app_multigpu_engine.sh"  # Updated script name
+        script_path = "./scripts/app_multigpu_engine.sh"  # Updated script path
 
         # Prepare the command
         cmd = [
@@ -141,4 +141,3 @@ def generate_text_to_video(prompt, temp, guidance_scale, video_guidance_scale, r
 
 # Launch Gradio app
 demo.launch(share=True)
-
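
Usage note: the relocated launcher keeps the positional interface documented in
its header, i.e. GPUS VARIANT MODEL_PATH TASK TEMP GUIDANCE_SCALE
VIDEO_GUIDANCE_SCALE RESOLUTION OUTPUT_PATH [IMAGE_PATH] PROMPT. A minimal
sketch of a text-to-video run once the series is applied, assuming two GPUs and
a checkpoint directory at ./pyramid-flow-model (both values are illustrative,
not part of the patches):

    # t2v: the prompt is the final positional argument; temp=16 latents
    # corresponds to 16 * 8 + 1 = 129 frames
    ./scripts/app_multigpu_engine.sh 2 diffusion_transformer_768p ./pyramid-flow-model \
        t2v 16 9.0 5.0 768p ./output.mp4 \
        "a cat gracefully surfing a wave at sunset"

For image-to-video the optional IMAGE_PATH slots in between OUTPUT_PATH and the
prompt; GUIDANCE_SCALE must still be supplied positionally even though the i2v
branch only consumes the video guidance scale:

    ./scripts/app_multigpu_engine.sh 2 diffusion_transformer_768p ./pyramid-flow-model \
        i2v 16 9.0 5.0 768p ./output.mp4 ./input.jpg \
        "the scene slowly comes to life"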
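
One sharp edge worth knowing about: $IMAGE_ARG is expanded unquoted in the
torchrun call so that it vanishes entirely for t2v, which also means an i2v
image path containing spaces would be word-split. A hedged alternative, should
that ever matter, is to collect the optional flag in a bash array (a sketch,
not part of this series):

    # Build the optional flag as an array: empty for t2v, quoting-safe for i2v
    IMAGE_ARGS=()
    if [ "$TASK" == "i2v" ]; then
        IMAGE_ARGS=(--image_path "$IMAGE_PATH")
    fi
    # ...and end the torchrun invocation with:
    #     --prompt "$PROMPT" \
    #     "${IMAGE_ARGS[@]}"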