@@ -72,15 +72,27 @@ def forward(self, x):
# Node: torch.ops.aten.mul.Tensor, with layer location: /mul
# Note: Some of the above nodes may be supported, but were not included in a TRT graph by the partitioner

+ # %%
+ # TRT module with cuda graphs
+ # ----------------------------------
+ #
+ # When CUDA Graphs are applied to a TensorRT model that contains graph breaks, each break introduces additional
+ # overhead. This occurs because graph breaks prevent the entire model from being executed as a single, continuous
+ # optimized unit. As a result, some of the performance benefits typically provided by CUDA Graphs, such as reduced
+ # kernel launch overhead and improved execution efficiency, may be diminished.
+ with torch_tensorrt.runtime.enable_cudagraphs():
+     trt_model(input)
+
# %%
# Running wrapped module with cuda graphs
# ----------------------------------
#
- # Please note that initializing with wrapper module involve warm-up phase where the module
- # is executed several times. This ensures that memory allocations and initializations are
- # not recorded in CUDA Graphs.
- # When using the TensorRT module within a CUDA Graph context manager, a wrapped_module is returned.
- # This module captures the execution graph, allowing for efficient replay during subsequent
- # inferences by reducing kernel launch overheads and improving performance.
+ # Using a wrapped runtime module with CUDA Graphs allows you to encapsulate sequences of operations into graphs
+ # that can be executed efficiently, even in the presence of graph breaks. When a CUDA Graph context manager is
+ # used with the TensorRT module as a positional argument, it returns a wrapped_module. This module captures the
+ # execution graph, enabling efficient replay during subsequent inferences by reducing kernel launch overheads
+ # and improving performance. Note that initializing with the wrapper module involves a warm-up phase where the
+ # module is executed several times. This warm-up ensures that memory allocations and initializations are not
+ # recorded in CUDA Graphs, which helps maintain consistent execution paths and optimize performance.
with torch_tensorrt.runtime.enable_cudagraphs(trt_model) as wrapped_module:
    wrapped_module(input)
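
As a rough, illustrative sketch (not part of the diff above), the two CUDA Graphs modes can be compared against the plain TensorRT module with a simple timing loop. The helper time_inference, the iteration counts, and the printed labels are hypothetical scaffolding for this example; trt_model, input, and both enable_cudagraphs usages come from the tutorial code shown in the hunk.

# Illustrative timing sketch; not part of the diff above.
import torch
import torch_tensorrt


def time_inference(fn, iterations=100, warmup=10):
    # Warm up first so one-time allocations and initializations are excluded.
    for _ in range(warmup):
        fn()
    torch.cuda.synchronize()
    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)
    start.record()
    for _ in range(iterations):
        fn()
    end.record()
    torch.cuda.synchronize()
    return start.elapsed_time(end) / iterations  # average latency in ms


# Plain TensorRT module, no CUDA Graphs.
base_ms = time_inference(lambda: trt_model(input))

# CUDA Graphs per TRT engine; each graph break still adds launch overhead in between.
with torch_tensorrt.runtime.enable_cudagraphs():
    per_engine_ms = time_inference(lambda: trt_model(input))

# Whole-sequence capture via the wrapped module returned by the context manager.
with torch_tensorrt.runtime.enable_cudagraphs(trt_model) as wrapped_module:
    wrapped_ms = time_inference(lambda: wrapped_module(input))

print(f"no graphs: {base_ms:.3f} ms | per-engine graphs: {per_engine_ms:.3f} ms | "
      f"wrapped module: {wrapped_ms:.3f} ms")

For a model with several graph breaks, the wrapped-module figure would typically be the lowest, since the kernel launches for the whole forward pass are replayed from a single captured graph rather than per TRT engine.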