@@ -115,6 +115,7 @@ def _wrap_for_dist_loader(self, train_dataloader):
         return dist_loader

     def _wrap_for_auto(self, model, train_dataloader):
+        logger.info("Wrapping model for auto parallel")
         dist_loader = self._wrap_for_dist_loader(train_dataloader)

         if ShardingOption.SHARD_OP in self.args.sharding:
@@ -135,6 +136,15 @@ def _wrap_for_auto(self, model, train_dataloader):
         if self.args.to_static:
             unified_strategy = dist.Strategy()
             unified_strategy._from_legacy_strategy(self.args.strategy)
+
+            # same logic as autocast_smart_context_manager() in trainer.py
+            if self.enable_autocast_context_manager:
+                unified_strategy.amp.custom_black_list.extend(["reduce_sum", "c_softmax_with_cross_entropy"])
+                if self.args.fp16_opt_level == "O2":
+                    print("custom_white_list", unified_strategy.amp.custom_white_list, flush=True)
+                    unified_strategy.amp.custom_white_list.extend(["lookup_table", "lookup_table_v2"])
+                    print("custom_white_list", unified_strategy.amp.custom_white_list, flush=True)
+
             # dist.to_static() obtains the input spec information through next(dataloader), but this has side effects
             # on the passed-in dataloader, altering the state of the sampler of the dataloader. In some cases, once
             # the state of the sampler is changed, it cannot be reverted. Therefore, a temporary dataloader is
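For reference, the custom black/white lists extended in the hunk above mirror what the dynamic-graph path does when it opens an autocast scope. Below is a minimal sketch of that equivalent logic, assuming Paddle's paddle.amp.auto_cast API; args, model, and batch are placeholder names for illustration, not objects taken from this diff.

    import paddle

    # Illustrative stand-in for the trainer's dynamic-graph autocast logic.
    custom_black_list = ["reduce_sum", "c_softmax_with_cross_entropy"]
    custom_white_list = []
    if args.fp16_opt_level == "O2":
        # Under O2, keep embedding lookups on the low-precision white list,
        # matching the ops added to unified_strategy.amp above.
        custom_white_list.extend(["lookup_table", "lookup_table_v2"])

    with paddle.amp.auto_cast(
        enable=True,
        custom_white_list=custom_white_list,
        custom_black_list=custom_black_list,
        level=args.fp16_opt_level,
    ):
        loss = model(**batch)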
@@ -156,9 +166,10 @@ def _wrap_amp_model(self, args, model):
             master_grad=self.args.amp_master_grad,
             excluded_layers=QuantizationLinear,
         )
+        self.enable_autocast_context_manager = True
+
         if args.to_static:
             return
-        self.enable_autocast_context_manager = True
         self.do_grad_scaling = True if self.args.fp16 else False
         self.scaler = dist.shard_scaler(paddle.amp.GradScaler(init_loss_scaling=self.args.scale_loss))

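The do_grad_scaling flag and the scaler set at the end of _wrap_amp_model follow Paddle's standard dynamic loss-scaling pattern; the diff additionally wraps the scaler with dist.shard_scaler for auto parallel, which the sketch below omits. model, batch, and optimizer are placeholder names, and the actual trainer wires this up inside its own training step.

    import paddle

    scaler = paddle.amp.GradScaler(init_loss_scaling=2.0**15)

    with paddle.amp.auto_cast(enable=True):
        loss = model(**batch)

    scaled = scaler.scale(loss)   # scale the loss so fp16 gradients do not underflow
    scaled.backward()
    scaler.step(optimizer)        # unscale gradients and run the optimizer step
    scaler.update()               # adjust the loss scaling factor for the next step
    optimizer.clear_grad()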