diff --git a/paddlemix/examples/cogagent/README.md b/paddlemix/examples/cogvlm/README.md
similarity index 62%
rename from paddlemix/examples/cogagent/README.md
rename to paddlemix/examples/cogvlm/README.md
index 71edf133c..0957b7b1b 100644
--- a/paddlemix/examples/cogagent/README.md
+++ b/paddlemix/examples/cogvlm/README.md
@@ -1,6 +1,15 @@
-# CogAgent
+## 1. CogVLM Model Introduction
 
-## 1. Model Introduction
+This model is the Paddle implementation of [CogVLM](https://arxiv.org/abs/2311.03079).
+
+[CogVLM](https://arxiv.org/abs/2311.03079) is a powerful open-source visual language model (VLM). CogVLM-17B has 10 billion vision parameters and 7 billion language parameters.
+
+CogVLM-17B achieves state-of-the-art performance on 10 classic cross-modal benchmarks, including NoCaps, Flickr30K captioning, RefCOCO, RefCOCO+, RefCOCOg, Visual7W, GQA, ScienceQA, VizWiz VQA, and TDIUC, and ranks second on VQAv2, OKVQA, TextVQA, COCO captioning, and others, surpassing or matching PaLI-X 55B. CogVLM can also chat with you about images.
+
+This repository provides the Paddle version of the cogvlm-chat model.
+
+
+## 2. CogAgent Model Introduction
 
 This model is the Paddle implementation of [CogAgent](https://arxiv.org/abs/2312.08914).
 
@@ -20,19 +29,19 @@ CogAgent-18B achieves state-of-the-art generalist performance on 9 classic cross-modal benchmarks
 
 This repository provides the Paddle version of the cogagent-chat model.
 
-## 2. Environment Setup
+## 3. Environment Setup
 
 1) [Install PaddleNLP](https://github.com/PaddlePaddle/PaddleNLP?tab=readme-ov-file#%E5%AE%89%E8%A3%85)
 
 2) [Install the PaddleMIX dependencies](https://github.com/PaddlePaddle/PaddleMIX/tree/b4f97ff859e1964c839fc5fab94f7ba63b1e5959?tab=readme-ov-file#%E5%AE%89%E8%A3%85)
 
-## 3. Quick Start
+## 4. Quick Start
 
 After completing the environment setup, we currently provide multi-turn chat usage:
 
 ```bash
-python paddlemix/examples/cogagent/chat_demo.py \
---from_pretrained "THUDM/cogagent-chat"
+python paddlemix/examples/cogvlm/chat_demo.py \
+--from_pretrained "THUDM/cogvlm-chat"
 ```
 
 Configurable arguments:
-  * `from_pretrained`: the CogAgent model name or checkpoint path (plus tokenizer); defaults to THUDM/cogagent-chat
+  * `from_pretrained`: the CogVLM model name or checkpoint path (plus tokenizer); defaults to THUDM/cogvlm-chat
diff --git a/paddlemix/examples/cogagent/chat_demo.py b/paddlemix/examples/cogvlm/chat_demo.py
similarity index 98%
rename from paddlemix/examples/cogagent/chat_demo.py
rename to paddlemix/examples/cogvlm/chat_demo.py
index c2f9994cf..5768f9808 100644
--- a/paddlemix/examples/cogagent/chat_demo.py
+++ b/paddlemix/examples/cogvlm/chat_demo.py
@@ -31,7 +31,7 @@
 parser = argparse.ArgumentParser()
-parser.add_argument("--from_pretrained", type=str, default="THUDM/cogagent-chat", help="pretrained ckpt and tokenizer")
+parser.add_argument("--from_pretrained", type=str, default="THUDM/cogvlm-chat", help="pretrained ckpt and tokenizer")
 args = parser.parse_args()
 MODEL_PATH = args.from_pretrained
 TOKENIZER_PATH = MODEL_PATH
diff --git a/paddlemix/models/__init__.py b/paddlemix/models/__init__.py
index 7246527c8..97f96b039 100644
--- a/paddlemix/models/__init__.py
+++ b/paddlemix/models/__init__.py
@@ -16,6 +16,8 @@
 from .audioldm2.configuration import *
 from .audioldm2.modeling import *
 from .blip2.modeling import *
+from .cogvlm.configuration import *
+from .cogvlm.modeling import *
 from .imagebind.modeling import *
 from .imagebind.multimodal_preprocessors import *
 from .llava import *
diff --git a/paddlemix/models/cogvlm/__init__.py b/paddlemix/models/cogvlm/__init__.py
new file mode 100644
index 000000000..6709d0167
--- /dev/null
+++ b/paddlemix/models/cogvlm/__init__.py
@@ -0,0 +1,16 @@
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .configuration import *
+from .modeling import *
diff --git a/paddlemix/models/cogagent/configuration.py b/paddlemix/models/cogvlm/configuration.py
similarity index 94%
rename from paddlemix/models/cogagent/configuration.py
rename to paddlemix/models/cogvlm/configuration.py
index 3e85cc6d9..0dd108a66 100644
--- a/paddlemix/models/cogagent/configuration.py
+++ b/paddlemix/models/cogvlm/configuration.py
@@ -17,7 +17,7 @@
 from paddlenlp import transformers
 
 
-class CogAgentConfig(transformers.PretrainedConfig):
+class CogModelConfig(transformers.PretrainedConfig):
     _auto_class = "AutoConfig"
 
     def __init__(
@@ -40,6 +40,7 @@ def __init__(
         eos_token_id=2,
         tie_word_embeddings=False,
         use_cache=True,
+        model_type="cogagent",
         **kwargs
     ):
         self.hidden_size = hidden_size
@@ -56,6 +57,7 @@ def __init__(
         self.hidden_act = hidden_act
         self.template_version = template_version
         self.use_cache = use_cache
+        self.model_type = model_type
         super().__init__(
             pad_token_id=pad_token_id,
             bos_token_id=bos_token_id,
diff --git a/paddlemix/models/cogagent/modeling.py b/paddlemix/models/cogvlm/modeling.py
similarity index 87%
rename from paddlemix/models/cogagent/modeling.py
rename to paddlemix/models/cogvlm/modeling.py
index 84cddc329..fc1585a34 100644
--- a/paddlemix/models/cogagent/modeling.py
+++ b/paddlemix/models/cogvlm/modeling.py
@@ -28,8 +28,8 @@
 )
 from paddlenlp.transformers.model_utils import PretrainedModel
 
-from .configuration import CogAgentConfig
-from .cross_visual import CrossVisionModel, EVA2CLIPModel
+from .configuration import CogModelConfig
+from .visual import CrossVisionModel, EVA2CLIPModel
 
 if TYPE_CHECKING:
     logger = transformers.utils.logging.get_logger(__name__)
@@ -362,16 +362,18 @@
         return attn_output, None, past_key_value
 
 
-class CogAgentDecoderLayer(paddle.nn.Layer):
+class CogModelDecoderLayer(paddle.nn.Layer):
     def __init__(self, config):
         super().__init__()
+        self.model_type = config.model_type
         self.hidden_size = config.hidden_size
         self.self_attn = VisionExpertAttention(config=config)
-        self.cross_attn = CrossAttention(config=config)
+        if self.model_type == "cogagent":
+            self.cross_attn = CrossAttention(config=config)
+            self.post_cross_attention_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
         self.mlp = VisionExpertMLP(config)
         self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
         self.post_attention_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
-        self.post_cross_attention_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
 
     def forward(
         self,
@@ -387,42 +389,59 @@
     ) -> Tuple[paddle.Tensor, Optional[Tuple[paddle.Tensor, paddle.Tensor]]]:
         residual = hidden_states
         hidden_states = self.input_layernorm(hidden_states)
+        # NOTE: do not overwrite `past_key_value` here; for cogagent its first two
+        # entries feed self-attention and its last two feed the cross-attention below.
+        if self.model_type == "cogagent":
+            self_past_key_value = past_key_value[:2] if past_key_value is not None else None
+        else:
+            self_past_key_value = past_key_value
         hidden_states, self_attn_weights, present_key_value = self.self_attn(
             hidden_states=hidden_states,
             token_type_ids=token_type_ids,
             position_ids=position_ids,
             attention_mask=attention_mask,
-            past_key_value=past_key_value[:2] if past_key_value is not None else None,
+            past_key_value=self_past_key_value,
             output_attentions=output_attentions,
             use_cache=use_cache,
         )
         hidden_states = residual + hidden_states
-        cross_input = self.post_cross_attention_layernorm(hidden_states)
-        (attention_output, self_cross_attn_weights, present_cross_key_value) = self.cross_attn(
-            hidden_states=cross_input,
-            encoder_outputs=encoder_outputs,
-            attention_mask=cross_attention_mask,
-            past_key_value=past_key_value[-2:] if past_key_value is not None else None,
-            output_attentions=output_attentions,
-            use_cache=use_cache,
-        )
-        hidden_states = hidden_states + attention_output
-        mlp_input = self.post_attention_layernorm(hidden_states)
-        mlp_output = self.mlp(mlp_input, token_type_ids=token_type_ids)
-        hidden_states = mlp_output + hidden_states
+        if self.model_type == "cogagent":
+            cross_input = self.post_cross_attention_layernorm(hidden_states)
+            (attention_output, self_cross_attn_weights, present_cross_key_value) = self.cross_attn(
+                hidden_states=cross_input,
+                encoder_outputs=encoder_outputs,
+                attention_mask=cross_attention_mask,
+                past_key_value=past_key_value[-2:] if past_key_value is not None else None,
+                output_attentions=output_attentions,
+                use_cache=use_cache,
+            )
+            hidden_states = hidden_states + attention_output
+            mlp_input = self.post_attention_layernorm(hidden_states)
+            mlp_output = self.mlp(mlp_input, token_type_ids=token_type_ids)
+            hidden_states = mlp_output + hidden_states
+        elif self.model_type == "cogvlm":
+            residual = hidden_states
+            hidden_states = self.post_attention_layernorm(hidden_states)
+            hidden_states = self.mlp(hidden_states, token_type_ids=token_type_ids)
+            hidden_states = residual + hidden_states
+        else:
+            raise ValueError("model_type in config must be cogagent or cogvlm, but got {}".format(self.model_type))
         outputs = (hidden_states,)
         if output_attentions:
             outputs += (self_attn_weights,)
         if use_cache:
-            outputs += (present_key_value + present_cross_key_value,)
+            if self.model_type == "cogagent":
+                outputs += (present_key_value + present_cross_key_value,)
+            else:
+                outputs += (present_key_value,)
         return outputs
 
 
-class CogAgentPreTrainedModel(PretrainedModel):
-    config_class = CogAgentConfig
+class CogPreTrainedModel(PretrainedModel):
+    config_class = CogModelConfig
     base_model_prefix = "model"
     supports_gradient_checkpointing = False
-    _no_split_modules = ["CogAgentDecoderLayer", "TransformerLayer", "Block"]
+    _no_split_modules = ["CogModelDecoderLayer", "TransformerLayer", "Block"]
     _skip_keys_device_placement = "past_key_values"
@@ -455,16 +470,17 @@ def build_position_ids(x, attention_mask):
     return y
 
 
-class CogAgentModel(CogAgentPreTrainedModel):
+class CogModel(CogPreTrainedModel):
     def __init__(self, config):
         super().__init__(config)
+        self.model_type = config.model_type
         self.padding_idx = config.pad_token_id
         self.vocab_size = config.vocab_size
         self.embed_tokens = paddle.nn.Embedding(
             num_embeddings=config.vocab_size, embedding_dim=config.hidden_size, padding_idx=self.padding_idx
         )
         self.layers = paddle.nn.LayerList(
-            sublayers=[CogAgentDecoderLayer(config) for _ in range(config.num_hidden_layers)]
+            sublayers=[CogModelDecoderLayer(config) for _ in range(config.num_hidden_layers)]
         )
         self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
         self.vision = EVA2CLIPModel(config)
@@ -509,7 +525,10 @@ def forward(
     ) -> Union[Tuple, BaseModelOutputWithPast]:
         """take care of image_encode, token_type_ids, position_ids and (attention_mask = None is fine)"""
         if past_key_values is not None:
-            encoder_outputs = None
+            if self.model_type == "cogagent" or 
self.model_type == "cogvlm": + encoder_outputs = None + else: + raise ValueError("model_type in config must be cogagent or cogvlm, but got {}".format(self.model_type)) else: assert input_ids is not None and inputs_embeds is None, f"{input_ids} {inputs_embeds}" if not is_empty(images): @@ -517,10 +536,16 @@ def forward( assert len(input_ids) == len(images), f"{len(input_ids)} {len(images)}" inputs_embeds = self.embed_tokens(input_ids) images_features = self.encode_images(images) - encoder_outputs = self.encode_cross_images(cross_images) + encoder_outputs = None + if self.model_type == "cogagent": + encoder_outputs = self.encode_cross_images(cross_images) + elif self.model_type != "cogvlm": + raise ValueError( + "model_type in config must be cogagent or cogvlm, but got {}".format(self.model_type) + ) images_features = rearrange(images_features, "b n d -> (b n) d") - images_features = images_features.to("float32") + images_features = images_features.to(dtype=inputs_embeds.dtype, device=inputs_embeds.place) inputs_embeds = inputs_embeds.index_put([token_type_ids == VISION_TOKEN_TYPE], images_features) else: if token_type_ids is None: @@ -564,7 +589,6 @@ def llm_forward( output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, ) -> Union[Tuple, BaseModelOutputWithPast]: - """largely copy from llama forward and adapt for CogAgent with `token_type_ids`""" output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states @@ -596,7 +620,7 @@ def llm_forward( inputs_embeds = self.embed_tokens(input_ids) if attention_mask is None: attention_mask = paddle.ones(shape=(batch_size, seq_length_with_past), dtype="bool") - if cross_attention_mask is None: + if self.model_type == "cogagent" and cross_attention_mask is None: cross_attention_mask = paddle.ones(shape=(batch_size, 1), dtype="bool") attention_mask = self._prepare_decoder_attention_mask( @@ -632,7 +656,7 @@ def llm_forward( next_cache = next_decoder_cache if use_cache else None if not return_dict: return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None) - return transformers.model_outputs.BaseModelOutputWithPast( + return BaseModelOutputWithPast( last_hidden_state=hidden_states, past_key_values=next_cache, hidden_states=all_hidden_states, @@ -689,7 +713,7 @@ def base_history_to_prompt(history, query): return prompt -_history_to_prompt = { +_history_to_prompt_for_cogagent = { "base": base_history_to_prompt, "chat": chat_history_to_prompt, "chat_old": chat_old_history_to_prompt, @@ -697,17 +721,33 @@ def base_history_to_prompt(history, query): } -class CogAgentForCausalLM(CogAgentPreTrainedModel): +def _history_to_prompt_for_cogvlm(signal_type, history, query): + if signal_type == "base": + return query + elif signal_type == "vqa": + answer_format = "Short answer:" + elif signal_type == "chat": + answer_format = "Answer:" + else: + assert False, f"Unknown signal type {signal_type}" + prompt = "" + for i, (old_query, response) in enumerate(history): + prompt += "Question: " + old_query + " {} ".format(answer_format) + response + "\n" + prompt += "Question: {} {}".format(query, answer_format) + return prompt + + +class CogModelForCausalLM(CogPreTrainedModel): _auto_class = "AutoModelForCausalLM" def __init__(self, config): super().__init__(config) - self.model = CogAgentModel(config) + self.model_type = config.model_type + 
self.model = CogModel(config) self.vocab_size = config.vocab_size self.lm_head = paddle.nn.Linear( in_features=config.hidden_size, out_features=config.vocab_size, bias_attr=False ) - # self._post_init() def get_input_embeddings(self): return self.model.embed_tokens @@ -777,7 +817,7 @@ def forward( if not return_dict: output = (logits,) + outputs[1:] return (loss,) + output if loss is not None else output - return transformers.model_outputs.CausalLMOutputWithPast( + return CausalLMOutputWithPast( loss=loss, logits=logits, past_key_values=outputs.past_key_values, @@ -877,16 +917,22 @@ def build_conversation_input_ids( template_version: Optional[Literal["base", "chat", "vqa"]] = None ): image_size: int = self.config.vision_config["image_size"] - cross_image_size: int = self.config.cross_image_size + if self.model_type == "cogagent": + cross_image_size: int = self.config.cross_image_size patch_size: int = self.config.vision_config["patch_size"] template_version = template_version or self.config.template_version assert images is None or len(images) <= 1, "not support multi images by now." history = history or [] - text = _history_to_prompt[template_version](history, query) + if self.model_type == "cogagent": + text = _history_to_prompt_for_cogagent[template_version](history, query) + elif self.model_type == "cogvlm": + text = _history_to_prompt_for_cogvlm(template_version, history, query) + else: + raise ValueError("model_type in config must be cogagent or cogvlm, but got {}".format(self.model_type)) + input_ids = [tokenizer.bos_token_id] token_type_ids = [LANGUAGE_TOKEN_TYPE] if images is not None and len(images) == 1: - ori = images transform = paddle.vision.transforms.Compose( [ paddle.vision.transforms.Resize((image_size, image_size), interpolation="bicubic"), @@ -896,17 +942,24 @@ def build_conversation_input_ids( ), ] ) - images = [transform(ori[0])] - cross_transform = paddle.vision.transforms.Compose( - [ - paddle.vision.transforms.Resize((cross_image_size, cross_image_size), interpolation="bicubic"), - paddle.vision.transforms.ToTensor(), - paddle.vision.transforms.Normalize( - (0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711) - ), - ] - ) - cross_images = [cross_transform(ori[0])] + if self.model_type == "cogagent": + ori = images + images = [transform(ori[0])] + cross_transform = paddle.vision.transforms.Compose( + [ + paddle.vision.transforms.Resize((cross_image_size, cross_image_size), interpolation="bicubic"), + paddle.vision.transforms.ToTensor(), + paddle.vision.transforms.Normalize( + (0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711) + ), + ] + ) + cross_images = [cross_transform(ori[0])] + elif self.model_type == "cogvlm": + images = [transform(images[0])] + cross_images = None + else: + raise ValueError("model_type in config must be cogagent or cogvlm, but got {}".format(self.model_type)) vision_token_num = image_size // patch_size * (image_size // patch_size) + 2 input_ids += [tokenizer.pad_token_id] * vision_token_num token_type_ids += [VISION_TOKEN_TYPE] * vision_token_num diff --git a/paddlemix/models/cogagent/cross_visual.py b/paddlemix/models/cogvlm/visual.py similarity index 97% rename from paddlemix/models/cogagent/cross_visual.py rename to paddlemix/models/cogvlm/visual.py index be881bc67..1b421d4fb 100644 --- a/paddlemix/models/cogagent/cross_visual.py +++ b/paddlemix/models/cogvlm/visual.py @@ -742,8 +742,6 @@ def forward(self, x, return_all_features=False): class LayerNorm(paddle.nn.LayerNorm): - """Subclass torch's 
LayerNorm (with cast back to input dtype).""" - def forward(self, x: paddle.Tensor): orig_type = x.dtype x = paddle.nn.functional.layer_norm( @@ -1008,6 +1006,7 @@ def forward(self, x): class EVA2CLIPModel(paddle.nn.Layer): def __init__(self, config): super().__init__() + self.model_type = config.model_type vision_config = Namespace(**config.vision_config) self.patch_embedding = PatchEmbedding(vision_config) self.transformer = Transformer(vision_config) @@ -1026,29 +1025,37 @@ def __init__(self, config): ) out_13.stop_gradient = not True self.eoi = out_13 - out_14 = paddle.create_parameter( - shape=paddle.zeros( - shape=[(vision_config.image_size // vision_config.patch_size) ** 2, vision_config.hidden_size] - ).shape, - dtype=paddle.zeros( - shape=[(vision_config.image_size // vision_config.patch_size) ** 2, vision_config.hidden_size] - ) - .numpy() - .dtype, - default_initializer=paddle.nn.initializer.Assign( - paddle.zeros( + if self.model_type == "cogagent": + out_14 = paddle.create_parameter( + shape=paddle.zeros( + shape=[(vision_config.image_size // vision_config.patch_size) ** 2, vision_config.hidden_size] + ).shape, + dtype=paddle.zeros( shape=[(vision_config.image_size // vision_config.patch_size) ** 2, vision_config.hidden_size] ) - ), - ) - out_14.stop_gradient = not True - self.pos_embed = out_14 + .numpy() + .dtype, + default_initializer=paddle.nn.initializer.Assign( + paddle.zeros( + shape=[(vision_config.image_size // vision_config.patch_size) ** 2, vision_config.hidden_size] + ) + ), + ) + out_14.stop_gradient = not True + self.pos_embed = out_14 + elif self.model_type != "cogvlm": + raise ValueError("model_type in config must be cogagent or cogvlm, but got {}".format(self.model_type)) def forward(self, images): x = self.patch_embedding(images) x = self.transformer(x) x = x[:, 1:] - x = self.linear_proj(x + self.pos_embed.unsqueeze(axis=0)) + if self.model_type == "cogagent": + x = self.linear_proj(x + self.pos_embed.unsqueeze(axis=0)) + elif self.model_type == "cogvlm": + x = self.linear_proj(x) + else: + raise ValueError("model_type in config must be cogagent or cogvlm, but got {}".format(self.model_type)) boi = self.boi.expand(shape=[x.shape[0], -1, -1]) eoi = self.eoi.expand(shape=[x.shape[0], -1, -1]) x = paddle.concat(x=(boi, x, eoi), axis=1) diff --git a/tests/models/test_cogagent.py b/tests/models/test_cogvlm.py similarity index 62% rename from tests/models/test_cogagent.py rename to tests/models/test_cogvlm.py index 0d96874bf..e1f2dcc08 100644 --- a/tests/models/test_cogagent.py +++ b/tests/models/test_cogvlm.py @@ -16,13 +16,15 @@ import sys sys.path.insert(0, os.path.join(os.path.dirname(os.path.abspath(__file__)), "../..")) +import tempfile import unittest import numpy as np import paddle -from paddlemix.models.cogagent.configuration import CogAgentConfig -from paddlemix.models.cogagent.modeling import CogAgentForCausalLM +from paddlemix.models.blip2.Qformer import BertLMHeadModel +from paddlemix.models.cogvlm.configuration import CogModelConfig +from paddlemix.models.cogvlm.modeling import CogModelForCausalLM from tests.models.test_configuration_common import ConfigTester from tests.models.test_modeling_common import ( ModelTesterMixin, @@ -39,6 +41,7 @@ def __init__(self, parent): def get_config(self): test_config = { + "model_type": "cogagent", "bos_token_id": 1, "cross_compute_hidden_size": 1024, "cross_hidden_size": 1024, @@ -72,7 +75,7 @@ def get_config(self): }, "vocab_size": 32000, } - return CogAgentConfig(**test_config) + return 
CogModelConfig(**test_config) def prepare_config_and_inputs(self): images = ([floats_tensor([3, 224, 224])],) @@ -103,7 +106,7 @@ def prepare_config_and_inputs_for_common(self): return config, inputs_dict def create_and_check_model(self, images, cross_images, input_ids, attention_mask, token_type_ids, position_ids): - model = CogAgentForCausalLM(config=self.get_config()) + model = CogModelForCausalLM(config=self.get_config()) model.eval() with paddle.no_grad(): result = model( @@ -119,7 +122,7 @@ def create_and_check_model(self, images, cross_images, input_ids, attention_mask class CogAgentForCausalLMTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = (CogAgentForCausalLM,) + all_model_classes = (CogModelForCausalLM,) fx_compatible = False test_head_masking = False test_pruning = False @@ -132,7 +135,7 @@ def setUp(self): self.model_tester = CogAgentForCausalLMTester(self) self.config_tester = ConfigTester( self, - config_class=CogAgentConfig, + config_class=CogModelConfig, ) def test_config(self): @@ -173,7 +176,89 @@ def test_model(self): @slow def test_model_from_pretrained(self): - model = CogAgentForCausalLM.from_pretrained("THUDM/cogagent-chat") + model = CogModelForCausalLM.from_pretrained("THUDM/cogagent-chat") + self.assertIsNotNone(model) + + +class CogVLMForCausalLMTester(CogAgentForCausalLMTester): + def get_config(self): + test_config = { + "model_type": "cogvlm", + "bos_token_id": 1, + # "cross_compute_hidden_size": 1024, + # "cross_hidden_size": 1024, + # "cross_image_size": 1120, + "eos_token_id": 2, + "hidden_act": "silu", + "hidden_size": 2, + "initializer_range": 0.02, + "intermediate_size": 2, + "max_position_embeddings": 2048, + "num_attention_heads": 1, + "num_hidden_layers": 1, + "pad_token_id": 0, + "paddlenlp_version": None, + "rms_norm_eps": 1e-05, + "template_version": "chat", + "tie_word_embeddings": False, + "transformers_version": "4.36.0.dev0", + "vision_config": { + "dropout_prob": 0.0, + "hidden_act": "gelu", + "hidden_size": 8, + "image_size": 224, + "in_channels": 3, + "intermediate_size": 2, + "layer_norm_eps": 1e-06, + "num_heads": 1, + "num_hidden_layers": 1, + "num_positions": 257, + "patch_size": 14, + }, + "vocab_size": 32000, + } + return CogModelConfig(**test_config) + + +class CogVLMForCausalLMTest(CogAgentForCausalLMTest): + def test_save_load(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + def check_save_load(out1, out2): + # make sure we don't have nans + out_2 = out2.numpy() + out_2[np.isnan(out_2)] = 0 + + out_1 = out1.numpy() + out_1[np.isnan(out_1)] = 0 + max_diff = np.amax(np.abs(out_1 - out_2)) + self.assertLessEqual(max_diff, 5e-5) + + for model_class in self.all_model_classes: + model = self._make_model_instance(config, model_class) + if isinstance(model, BertLMHeadModel): + model = model.bert + model.eval() + with paddle.no_grad(): + first = model(**self._prepare_for_class(inputs_dict, model_class))[0] + + with tempfile.TemporaryDirectory() as tmpdirname: + model.save_pretrained(tmpdirname, save_function=paddle.save) + model = model_class.from_pretrained(tmpdirname) + model.eval() + with paddle.no_grad(): + second = model(**self._prepare_for_class(inputs_dict, model_class))[0] + + # support tuple of tensor + if isinstance(first, tuple) and isinstance(second, tuple): + for tensor1, tensor2 in zip(first, second): + check_save_load(tensor1, tensor2) + else: + check_save_load(first, second) + + @slow + def test_model_from_pretrained(self): + model = 
CogModelForCausalLM.from_pretrained("THUDM/cogvlm-chat") self.assertIsNotNone(model)
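
Reviewer note: two small sketches follow; they are commentary on this patch, not part of it. First, the cogvlm prompt template added in `_history_to_prompt_for_cogvlm` is easy to sanity-check in isolation. The helper below mirrors the function from this diff (lightly tidied: unused `enumerate` index dropped, `assert False` replaced by a raise); the sample history and query strings are invented for illustration.

```python
# Mirrors _history_to_prompt_for_cogvlm from paddlemix/models/cogvlm/modeling.py.
def history_to_prompt(signal_type, history, query):
    if signal_type == "base":
        return query
    elif signal_type == "vqa":
        answer_format = "Short answer:"
    elif signal_type == "chat":
        answer_format = "Answer:"
    else:
        raise ValueError(f"Unknown signal type {signal_type}")
    prompt = ""
    for old_query, response in history:
        prompt += "Question: " + old_query + " {} ".format(answer_format) + response + "\n"
    prompt += "Question: {} {}".format(query, answer_format)
    return prompt

history = [("What is in the image?", "A cat sitting on a sofa.")]
print(history_to_prompt("chat", history, "What color is the cat?"))
# Question: What is in the image? Answer: A cat sitting on a sofa.
# Question: What color is the cat? Answer:
```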
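Second, a minimal smoke test of the `model_type="cogvlm"` branch, reusing the tiny hyperparameters from `CogVLMForCausalLMTester` above. This assumes the patch is applied to a working PaddleMIX checkout; the parameter-count print at the end is only a convenient liveness check.

```python
from paddlemix.models.cogvlm.configuration import CogModelConfig
from paddlemix.models.cogvlm.modeling import CogModelForCausalLM

# model_type="cogvlm" disables the cross-attention branch and cross_images.
config = CogModelConfig(
    model_type="cogvlm",
    hidden_size=2,
    intermediate_size=2,
    num_attention_heads=1,
    num_hidden_layers=1,
    vocab_size=32000,
    vision_config={
        "dropout_prob": 0.0,
        "hidden_act": "gelu",
        "hidden_size": 8,
        "image_size": 224,
        "in_channels": 3,
        "intermediate_size": 2,
        "layer_norm_eps": 1e-06,
        "num_heads": 1,
        "num_hidden_layers": 1,
        "num_positions": 257,
        "patch_size": 14,
    },
)
model = CogModelForCausalLM(config)
model.eval()

# Each image contributes (224 // 14) ** 2 + 2 = 258 vision tokens (patch grid
# plus BOI/EOI), matching vision_token_num in build_conversation_input_ids.
print(sum(int(p.numel()) for p in model.parameters()))
```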