Commit d26ff69

mcremon-meta authored and facebook-github-bot committed
Make quantize_pt2 return an ExportedProgram instead of a GraphModule
Summary:
This will help differentiate the fp32 models from the quantized models, and prevent people from using the wrong APIs. For fp32 cases, we have a `torch.nn.Module`, which we trace and then lower. For quantized cases, we trace, quantize, and lower.

After this diff, `export_to_<edge, executorch>` will ONLY handle non-quantized cases, and importantly, the sequence of `quantize_pt2` followed by `export_to_<edge, executorch>` will no longer work. Those cases should use the (existing) `lower_ep_to_<edge, executorch>` instead.

Differential Revision: D73722640
1 parent 8ffdea1 commit d26ff69
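
As a quick illustration of the split described above, here is a minimal sketch of the two call paths after this change. The toy model and the import path are assumptions for illustration; only the function names and signatures come from the diff below.

    import torch

    # Assumed import path, inferred from the file location in this commit.
    from executorch.backends.cadence.aot.compiler import (
        export_to_edge,
        lower_ep_to_edge,
        quantize_pt2,
    )

    # Hypothetical toy model/inputs pair, for illustration only.
    class Toy(torch.nn.Module):
        def forward(self, x):
            return torch.nn.functional.relu(x)

    model = Toy()
    inputs = (torch.randn(4, 8),)

    # fp32 path: trace and lower a plain torch.nn.Module.
    fp32_edge = export_to_edge(model, inputs)

    # Quantized path: quantize_pt2 now returns an ExportedProgram, so it must be
    # lowered with lower_ep_to_edge; chaining it into export_to_edge no longer works.
    quantized_ep = quantize_pt2(model, inputs)
    quantized_edge = lower_ep_to_edge(quantized_ep)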

File tree

2 files changed (+34 -10)

2 files changed

+34
-10
lines changed

backends/cadence/aot/compiler.py (+33 -8)
@@ -151,16 +151,14 @@ def quantize_pt2(
     quantizer: Optional[CadenceQuantizer] = None,
     calibration_data: Optional[list[tuple[object, ...]]] = None,
     dump_graphs: bool = False,
-) -> torch.fx.GraphModule:
+) -> ExportedProgram:
     """
     Trace, prepare, convert and fuse the model using the given quantizer.
     If calibration data is provided, it will be used to calibrate the model. If
     not, the inputs will be used for calibration instead, which is useful for
     unit tests but should not be used for end-to-end use cases.
     Returns a GraphModule with the quantized model.
     """
-    # Make the model inference mode by calling model.eval()
-    model.eval()
 
     # Instantiate the quantizer to CadenceQuantizer if not supplied
     if not quantizer:
@@ -178,7 +176,9 @@ def quantize_pt2(
     logging.info("Graph after quantization and fusion:")
     logging.info(fused_gm.graph.print_tabular())
 
-    return fused_gm
+    program = torch.export.export(fused_gm, inputs, strict=True)
+
+    return program
 
 
 # Export the model and lower it to an ExportedProgram (in aten IR)
@@ -260,21 +260,43 @@ def quantize_and_export_to_edge(
     dump_graphs: bool = False,
     constant_methods: Optional[dict[str, object]] = None,
 ) -> EdgeProgramManager:
+    """
+    Trace, quantize and lower a model/inputs pair to edge IR.
+    """
     quantized_model = quantize_pt2(
         model,
         inputs,
         quantizer=quantizer,
         dump_graphs=dump_graphs,
     )
 
-    return export_to_edge(
+    return lower_ep_to_edge(
         quantized_model,
-        inputs,
         dump_graphs=dump_graphs,
         constant_methods=constant_methods,
     )
 
 
+def lower_ep_to_cadence(
+    program: ExportedProgram,
+    dump_graphs: bool = False,
+    opt_level: int = 1,
+) -> EdgeProgramManager:
+    """
+    Lower an existing ExportedProgram to edge IR and apply frontend optimization passes.
+    """
+    edge_prog_manager = lower_ep_to_edge(program, dump_graphs=dump_graphs)
+    cadence_passes = get_cadence_passes(opt_level)
+
+    # Run a couple required passes for quant/dequant ops
+    cadence_prog_manager = edge_prog_manager.transform(
+        cast(
+            list[Callable[[torch.fx.GraphModule], Optional[PassResult]]], cadence_passes
+        )
+    )
+    return cadence_prog_manager
+
+
 def export_to_cadence(
     model: torch.nn.Module,
     inputs: tuple[object, ...],
@@ -299,11 +321,14 @@ def quantize_and_export_to_cadence(
     dump_graphs: bool = False,
     opt_level: int = 1,
 ) -> EdgeProgramManager:
+    """
+    Trace, quantize, lower a model/inputs pair to edge IR and apply frontend
+    optimization passes.
+    """
     quantized_model = quantize_pt2(model, inputs)
 
-    return export_to_cadence(
+    return lower_ep_to_cadence(
         quantized_model,
-        inputs,
         opt_level=opt_level,
         dump_graphs=dump_graphs,
     )
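
The new lower_ep_to_cadence mirrors lower_ep_to_edge but additionally applies the Cadence pass pipeline selected by opt_level. A usage sketch, reusing the hypothetical model/inputs pair and the assumed import path from above:

    from executorch.backends.cadence.aot.compiler import (
        lower_ep_to_cadence,
        quantize_and_export_to_cadence,
        quantize_pt2,
    )

    # Two-step: quantize to an ExportedProgram, then lower with Cadence passes.
    ep = quantize_pt2(model, inputs)
    cadence_prog = lower_ep_to_cadence(ep, opt_level=1)

    # One-step convenience wrapper, equivalent to the two calls above.
    cadence_prog = quantize_and_export_to_cadence(model, inputs, opt_level=1)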

backends/cadence/aot/tests/test_replace_ops_passes.py (+1 -2)
@@ -113,9 +113,8 @@ def forward(self, x, y):
     Y = torch.randn(y_shape)
     p = ReplaceMatmulWithTransposedMatmulPass()
     inputs = (X, Y)
-    quantized_model = quantize_pt2(model, inputs)
     graph_module = (
-        export_to_edge(quantized_model, inputs).exported_program().graph_module
+        quantize_and_export_to_edge(model, inputs).exported_program().graph_module
     )
     # pyre-fixme[16]: Optional type has no attribute `graph_module`
     graph_after_passes = p(graph_module).graph_module
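
The test update above is the general migration pattern for any caller that previously chained quantize_pt2 with export_to_edge; a sketch of hypothetical caller code:

    from executorch.backends.cadence.aot.compiler import quantize_and_export_to_edge

    # Before this change (no longer works):
    #   gm = quantize_pt2(model, inputs)      # returned a GraphModule
    #   edge = export_to_edge(gm, inputs)
    #
    # After this change:
    edge = quantize_and_export_to_edge(model, inputs)
    graph_module = edge.exported_program().graph_module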
