
Commit cef01ba

Refactoring, cleanup, improved test coverage.

* Add eca_nfnet_l2 weights, 84.7 @ 384x384
* All 'non-std' (i.e. transformer / mlp) models have classifier / default_cfg tests added
* Fix huggingface#694: reset_classifier / num_features / forward_features / num_classes=0 consistency for transformer / mlp models
* Add direct loading of npz to vision transformer (pure transformer so far, hybrid to come)
* Rename vit_deit* to deit_*
* Remove some deprecated vit hybrid model defs
* Clean up classifier flatten for conv classifiers and unusual cases (mobilenetv3/ghostnet)
* Remove explicit model fns for levit conv, just pass in arg

1 parent 6018cb9 commit cef01ba

24 files changed: +637, -455 lines
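Before the per-file diffs, a short illustrative sketch of the reset_classifier / num_features / num_classes=0 contract the non-std models now follow. This is not part of the commit; it assumes a timm build containing these changes and uses the renamed deit_tiny_patch16_224 entry point.

import torch
import timm

model = timm.create_model('deit_tiny_patch16_224', pretrained=False)
model.eval()
x = torch.randn(1, 3, 224, 224)

# forward_features: the feature dim is expected to match model.num_features
feats = model.forward_features(x)
if isinstance(feats, tuple):  # e.g. distilled variants return a tuple
    feats = feats[0]
assert feats.shape[1] == model.num_features

# num_classes=0 via reset_classifier: the head becomes a no-op, forward yields features
model.reset_classifier(0)
pooled = model(x)
assert pooled.ndim == 2 and pooled.shape[1] == model.num_features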

tests/test_models.py  (+52, -3)

@@ -17,7 +17,7 @@
 # transformer models don't support many of the spatial / feature based model functionalities
 NON_STD_FILTERS = [
     'vit_*', 'tnt_*', 'pit_*', 'swin_*', 'coat_*', 'cait_*', '*mixer_*', 'gmlp_*', 'resmlp_*', 'twins_*',
-    'convit_*', 'levit*', 'visformer*']
+    'convit_*', 'levit*', 'visformer*', 'deit*']
 NUM_NON_STD = len(NON_STD_FILTERS)

 # exclude models that cause specific test failures
@@ -120,7 +120,6 @@ def test_model_default_cfgs(model_name, batch_size):
     state_dict = model.state_dict()
     cfg = model.default_cfg

-    classifier = cfg['classifier']
     pool_size = cfg['pool_size']
     input_size = model.default_cfg['input_size']

@@ -149,7 +148,57 @@ def test_model_default_cfgs(model_name, batch_size):
     assert outputs.shape[-1] == pool_size[-1] and outputs.shape[-2] == pool_size[-2]

     # check classifier name matches default_cfg
-    assert classifier + ".weight" in state_dict.keys(), f'{classifier} not in model params'
+    classifier = cfg['classifier']
+    if not isinstance(classifier, (tuple, list)):
+        classifier = classifier,
+    for c in classifier:
+        assert c + ".weight" in state_dict.keys(), f'{c} not in model params'
+
+    # check first conv(s) names match default_cfg
+    first_conv = cfg['first_conv']
+    if isinstance(first_conv, str):
+        first_conv = (first_conv,)
+    assert isinstance(first_conv, (tuple, list))
+    for fc in first_conv:
+        assert fc + ".weight" in state_dict.keys(), f'{fc} not in model params'
+
+
+@pytest.mark.timeout(300)
+@pytest.mark.parametrize('model_name', list_models(filter=NON_STD_FILTERS))
+@pytest.mark.parametrize('batch_size', [1])
+def test_model_default_cfgs_non_std(model_name, batch_size):
+    """Run a single forward pass with each model"""
+    model = create_model(model_name, pretrained=False)
+    model.eval()
+    state_dict = model.state_dict()
+    cfg = model.default_cfg
+
+    input_size = _get_input_size(model_name=model_name, target=TARGET_FWD_SIZE)
+    if max(input_size) > MAX_FWD_SIZE:
+        pytest.skip("Fixed input size model > limit.")
+
+    input_tensor = torch.randn((batch_size, *input_size))
+
+    # test forward_features (always unpooled)
+    outputs = model.forward_features(input_tensor)
+    if isinstance(outputs, tuple):
+        outputs = outputs[0]
+    assert outputs.shape[1] == model.num_features
+
+    # test forward after deleting the classifier, output should be pooled, size(-1) == model.num_features
+    model.reset_classifier(0)
+    outputs = model.forward(input_tensor)
+    if isinstance(outputs, tuple):
+        outputs = outputs[0]
+    assert len(outputs.shape) == 2
+    assert outputs.shape[1] == model.num_features
+
+    # check classifier name matches default_cfg
+    classifier = cfg['classifier']
+    if not isinstance(classifier, (tuple, list)):
+        classifier = classifier,
+    for c in classifier:
+        assert c + ".weight" in state_dict.keys(), f'{c} not in model params'

     # check first conv(s) names match default_cfg
     first_conv = cfg['first_conv']
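The tuple-or-string handling above exists because default_cfg['classifier'] may name several heads (e.g. distilled deit variants); the same normalization can be reused outside the test suite. A hypothetical helper, for illustration only:

import timm


def classifier_params_present(model):
    """Return True if every default_cfg classifier name maps to a weight in the state dict."""
    state_dict = model.state_dict()
    classifier = model.default_cfg['classifier']
    if not isinstance(classifier, (tuple, list)):
        classifier = (classifier,)  # most models store a single name as a plain string
    return all(name + '.weight' in state_dict for name in classifier)


# distilled deit models are expected to report a ('head', 'head_dist') tuple here
model = timm.create_model('deit_tiny_distilled_patch16_224', pretrained=False)
print(classifier_params_present(model))  # expected: True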

timm/models/cait.py  (+18, -12)

@@ -74,11 +74,11 @@ def _cfg(url='', **kwargs):
 class ClassAttn(nn.Module):
     # taken from https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py
     # with slight modifications to do CA
-    def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0., proj_drop=0.):
+    def __init__(self, dim, num_heads=8, qkv_bias=False, attn_drop=0., proj_drop=0.):
         super().__init__()
         self.num_heads = num_heads
         head_dim = dim // num_heads
-        self.scale = qk_scale or head_dim ** -0.5
+        self.scale = head_dim ** -0.5

         self.q = nn.Linear(dim, dim, bias=qkv_bias)
         self.k = nn.Linear(dim, dim, bias=qkv_bias)
@@ -110,13 +110,13 @@ class LayerScaleBlockClassAttn(nn.Module):
     # taken from https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py
     # with slight modifications to add CA and LayerScale
     def __init__(
-            self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0., attn_drop=0.,
+            self, dim, num_heads, mlp_ratio=4., qkv_bias=False, drop=0., attn_drop=0.,
             drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm, attn_block=ClassAttn,
             mlp_block=Mlp, init_values=1e-4):
         super().__init__()
         self.norm1 = norm_layer(dim)
         self.attn = attn_block(
-            dim, num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop)
+            dim, num_heads=num_heads, qkv_bias=qkv_bias, attn_drop=attn_drop, proj_drop=drop)
         self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
         self.norm2 = norm_layer(dim)
         mlp_hidden_dim = int(dim * mlp_ratio)
@@ -134,14 +134,14 @@ def forward(self, x, x_cls):
 class TalkingHeadAttn(nn.Module):
     # taken from https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py
     # with slight modifications to add Talking Heads Attention (https://arxiv.org/pdf/2003.02436v1.pdf)
-    def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0., proj_drop=0.):
+    def __init__(self, dim, num_heads=8, qkv_bias=False, attn_drop=0., proj_drop=0.):
         super().__init__()

         self.num_heads = num_heads

         head_dim = dim // num_heads

-        self.scale = qk_scale or head_dim ** -0.5
+        self.scale = head_dim ** -0.5

         self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
         self.attn_drop = nn.Dropout(attn_drop)
@@ -177,13 +177,13 @@ class LayerScaleBlock(nn.Module):
     # taken from https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py
     # with slight modifications to add layerScale
     def __init__(
-            self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0., attn_drop=0.,
+            self, dim, num_heads, mlp_ratio=4., qkv_bias=False, drop=0., attn_drop=0.,
             drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm, attn_block=TalkingHeadAttn,
             mlp_block=Mlp, init_values=1e-4):
         super().__init__()
         self.norm1 = norm_layer(dim)
         self.attn = attn_block(
-            dim, num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop)
+            dim, num_heads=num_heads, qkv_bias=qkv_bias, attn_drop=attn_drop, proj_drop=drop)
         self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
         self.norm2 = norm_layer(dim)
         mlp_hidden_dim = int(dim * mlp_ratio)
@@ -202,7 +202,7 @@ class Cait(nn.Module):
     # with slight modifications to adapt to our cait models
     def __init__(
             self, img_size=224, patch_size=16, in_chans=3, num_classes=1000, embed_dim=768, depth=12,
-            num_heads=12, mlp_ratio=4., qkv_bias=True, qk_scale=None, drop_rate=0., attn_drop_rate=0.,
+            num_heads=12, mlp_ratio=4., qkv_bias=True, drop_rate=0., attn_drop_rate=0.,
             drop_path_rate=0.,
             norm_layer=partial(nn.LayerNorm, eps=1e-6),
             global_pool=None,
@@ -235,14 +235,14 @@ def __init__(
         dpr = [drop_path_rate for i in range(depth)]
         self.blocks = nn.ModuleList([
             block_layers(
-                dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale,
+                dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias,
                 drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i], norm_layer=norm_layer,
                 act_layer=act_layer, attn_block=attn_block, mlp_block=mlp_block, init_values=init_scale)
             for i in range(depth)])

         self.blocks_token_only = nn.ModuleList([
             block_layers_token(
-                dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio_clstk, qkv_bias=qkv_bias, qk_scale=qk_scale,
+                dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio_clstk, qkv_bias=qkv_bias,
                 drop=0.0, attn_drop=0.0, drop_path=0.0, norm_layer=norm_layer,
                 act_layer=act_layer, attn_block=attn_block_token_only,
                 mlp_block=mlp_block_token_only, init_values=init_scale)
@@ -270,6 +270,13 @@ def _init_weights(self, m):
     def no_weight_decay(self):
         return {'pos_embed', 'cls_token'}

+    def get_classifier(self):
+        return self.head
+
+    def reset_classifier(self, num_classes, global_pool=''):
+        self.num_classes = num_classes
+        self.head = nn.Linear(self.num_features, num_classes) if num_classes > 0 else nn.Identity()
+
     def forward_features(self, x):
         B = x.shape[0]
         x = self.patch_embed(x)
@@ -293,7 +300,6 @@ def forward_features(self, x):
     def forward(self, x):
         x = self.forward_features(x)
         x = self.head(x)
-
         return x

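With get_classifier / reset_classifier now defined, Cait can be used as a pooled feature extractor like the conv nets; a small usage sketch (illustrative only, assumes the cait_xxs24_224 entry point):

import torch
import timm

model = timm.create_model('cait_xxs24_224', pretrained=False)
model.eval()
model.reset_classifier(0)  # head becomes nn.Identity()

with torch.no_grad():
    out = model(torch.randn(1, 3, 224, 224))
print(out.shape)  # expected: (1, model.num_features), i.e. the class-token embedding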

timm/models/coat.py  (+5, -3)

@@ -335,6 +335,8 @@ def __init__(
         crpe_window = crpe_window or {3: 2, 5: 3, 7: 3}
         self.return_interm_layers = return_interm_layers
         self.out_features = out_features
+        self.embed_dims = embed_dims
+        self.num_features = embed_dims[-1]
         self.num_classes = num_classes

         # Patch embeddings.
@@ -441,10 +443,10 @@ def __init__(
             # CoaT series: Aggregate features of last three scales for classification.
             assert embed_dims[1] == embed_dims[2] == embed_dims[3]
             self.aggregate = torch.nn.Conv1d(in_channels=3, out_channels=1, kernel_size=1)
-            self.head = nn.Linear(embed_dims[3], num_classes)
+            self.head = nn.Linear(self.num_features, num_classes) if num_classes > 0 else nn.Identity()
         else:
             # CoaT-Lite series: Use feature of last scale for classification.
-            self.head = nn.Linear(embed_dims[3], num_classes)
+            self.head = nn.Linear(self.num_features, num_classes) if num_classes > 0 else nn.Identity()

         # Initialize weights.
         trunc_normal_(self.cls_token1, std=.02)
@@ -471,7 +473,7 @@ def get_classifier(self):

     def reset_classifier(self, num_classes, global_pool=''):
         self.num_classes = num_classes
-        self.head = nn.Linear(self.embed_dim, num_classes) if num_classes > 0 else nn.Identity()
+        self.head = nn.Linear(self.num_features, num_classes) if num_classes > 0 else nn.Identity()

     def insert_cls(self, x, cls_token):
         """ Insert CLS token. """

timm/models/convit.py  (+10, -13)

@@ -57,13 +57,13 @@ def _cfg(url='', **kwargs):


 class GPSA(nn.Module):
-    def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0., proj_drop=0.,
+    def __init__(self, dim, num_heads=8, qkv_bias=False, attn_drop=0., proj_drop=0.,
                  locality_strength=1.):
         super().__init__()
         self.num_heads = num_heads
         self.dim = dim
         head_dim = dim // num_heads
-        self.scale = qk_scale or head_dim ** -0.5
+        self.scale = head_dim ** -0.5
         self.locality_strength = locality_strength

         self.qk = nn.Linear(dim, dim * 2, bias=qkv_bias)
@@ -142,11 +142,11 @@ def get_rel_indices(self, num_patches: int) -> torch.Tensor:


 class MHSA(nn.Module):
-    def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0., proj_drop=0.):
+    def __init__(self, dim, num_heads=8, qkv_bias=False, attn_drop=0., proj_drop=0.):
         super().__init__()
         self.num_heads = num_heads
         head_dim = dim // num_heads
-        self.scale = qk_scale or head_dim ** -0.5
+        self.scale = head_dim ** -0.5

         self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
         self.attn_drop = nn.Dropout(attn_drop)
@@ -191,19 +191,16 @@ def forward(self, x):

 class Block(nn.Module):

-    def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0., attn_drop=0.,
+    def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, drop=0., attn_drop=0.,
                  drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm, use_gpsa=True, **kwargs):
         super().__init__()
         self.norm1 = norm_layer(dim)
         self.use_gpsa = use_gpsa
         if self.use_gpsa:
             self.attn = GPSA(
-                dim, num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop,
-                proj_drop=drop, **kwargs)
+                dim, num_heads=num_heads, qkv_bias=qkv_bias, attn_drop=attn_drop, proj_drop=drop, **kwargs)
         else:
-            self.attn = MHSA(
-                dim, num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop,
-                proj_drop=drop, **kwargs)
+            self.attn = MHSA(dim, num_heads=num_heads, qkv_bias=qkv_bias, attn_drop=attn_drop, proj_drop=drop)
         self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
         self.norm2 = norm_layer(dim)
         mlp_hidden_dim = int(dim * mlp_ratio)
@@ -220,7 +217,7 @@ class ConViT(nn.Module):
     """

     def __init__(self, img_size=224, patch_size=16, in_chans=3, num_classes=1000, embed_dim=768, depth=12,
-                 num_heads=12, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop_rate=0., attn_drop_rate=0.,
+                 num_heads=12, mlp_ratio=4., qkv_bias=False, drop_rate=0., attn_drop_rate=0.,
                  drop_path_rate=0., hybrid_backbone=None, norm_layer=nn.LayerNorm, global_pool=None,
                  local_up_to_layer=3, locality_strength=1., use_pos_embed=True):
         super().__init__()
@@ -250,13 +247,13 @@ def __init__(self, img_size=224, patch_size=16, in_chans=3, num_classes=1000, em
         dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)]  # stochastic depth decay rule
         self.blocks = nn.ModuleList([
             Block(
-                dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale,
+                dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias,
                 drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i], norm_layer=norm_layer,
                 use_gpsa=True,
                 locality_strength=locality_strength)
             if i < local_up_to_layer else
             Block(
-                dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale,
+                dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias,
                 drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i], norm_layer=norm_layer,
                 use_gpsa=False)
             for i in range(depth)])
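The removed qk_scale argument defaulted to the standard 1/sqrt(head_dim) scaling, which these attention modules now hard-code; a standalone toy sketch of that scaling (not timm code):

import math
import torch

dim, num_heads = 64, 8
head_dim = dim // num_heads
scale = head_dim ** -0.5  # what GPSA / MHSA now always use
assert math.isclose(scale, 1.0 / math.sqrt(head_dim))

q = torch.randn(1, num_heads, 16, head_dim)  # (batch, heads, tokens, head_dim)
k = torch.randn(1, num_heads, 16, head_dim)
attn = (q @ k.transpose(-2, -1)) * scale     # scaled dot-product logits
attn = attn.softmax(dim=-1)
print(attn.shape)  # (1, 8, 16, 16)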

timm/models/dla.py  (+4, -2)

@@ -288,6 +288,8 @@ def __init__(self, levels, channels, output_stride=32, num_classes=1000, in_chan
         self.num_features = channels[-1]
         self.global_pool, self.fc = create_classifier(
             self.num_features, self.num_classes, pool_type=global_pool, use_conv=True)
+        self.flatten = nn.Flatten(1) if global_pool else nn.Identity()
+
         for m in self.modules():
             if isinstance(m, nn.Conv2d):
                 n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
@@ -314,6 +316,7 @@ def reset_classifier(self, num_classes, global_pool='avg'):
         self.num_classes = num_classes
         self.global_pool, self.fc = create_classifier(
             self.num_features, self.num_classes, pool_type=global_pool, use_conv=True)
+        self.flatten = nn.Flatten(1) if global_pool else nn.Identity()

     def forward_features(self, x):
         x = self.base_layer(x)
@@ -331,8 +334,7 @@ def forward(self, x):
         if self.drop_rate > 0.:
             x = F.dropout(x, p=self.drop_rate, training=self.training)
         x = self.fc(x)
-        if not self.global_pool.is_identity():
-            x = x.flatten(1)  # conv classifier, flatten if pooling isn't pass-through (disabled)
+        x = self.flatten(x)
         return x
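The nn.Flatten(1)-or-Identity member replaces the is_identity() check at forward time; a minimal standalone sketch of the conv-classifier head pattern it supports (hypothetical module, not timm code):

import torch
import torch.nn as nn


class ConvClassifierHead(nn.Module):
    """Global pool -> 1x1 conv classifier -> flatten; flatten disabled when pooling is."""

    def __init__(self, num_features, num_classes, global_pool='avg'):
        super().__init__()
        self.pool = nn.AdaptiveAvgPool2d(1) if global_pool else nn.Identity()
        self.fc = nn.Conv2d(num_features, num_classes, kernel_size=1)
        # mirrors the dla/dpn change: only flatten when pooling produced a 1x1 map
        self.flatten = nn.Flatten(1) if global_pool else nn.Identity()

    def forward(self, x):
        return self.flatten(self.fc(self.pool(x)))


head = ConvClassifierHead(512, 1000, global_pool='avg')
print(head(torch.randn(2, 512, 7, 7)).shape)  # (2, 1000)

head = ConvClassifierHead(512, 1000, global_pool='')
print(head(torch.randn(2, 512, 7, 7)).shape)  # (2, 1000, 7, 7), dense logits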

timm/models/dpn.py  (+3, -2)

@@ -237,6 +237,7 @@ def __init__(self, small=False, num_init_features=64, k_r=96, groups=32,
         # Using 1x1 conv for the FC layer to allow the extra pooling scheme
         self.global_pool, self.classifier = create_classifier(
             self.num_features, self.num_classes, pool_type=global_pool, use_conv=True)
+        self.flatten = nn.Flatten(1) if global_pool else nn.Identity()

     def get_classifier(self):
         return self.classifier
@@ -245,6 +246,7 @@ def reset_classifier(self, num_classes, global_pool='avg'):
         self.num_classes = num_classes
         self.global_pool, self.classifier = create_classifier(
             self.num_features, self.num_classes, pool_type=global_pool, use_conv=True)
+        self.flatten = nn.Flatten(1) if global_pool else nn.Identity()

     def forward_features(self, x):
         return self.features(x)
@@ -255,8 +257,7 @@ def forward(self, x):
         if self.drop_rate > 0.:
             x = F.dropout(x, p=self.drop_rate, training=self.training)
         x = self.classifier(x)
-        if not self.global_pool.is_identity():
-            x = x.flatten(1)  # conv classifier, flatten if pooling isn't pass-through (disabled)
+        x = self.flatten(x)
         return x
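For DPN (and DLA) the practical effect is that num_classes=0 with pooling disabled now yields an unpooled feature map, while the default keeps a flat vector; a hedged usage sketch (dpn68 name assumed, shapes are the expected ones after this change):

import torch
import timm

x = torch.randn(1, 3, 224, 224)
model = timm.create_model('dpn68', pretrained=False)

model.reset_classifier(0)                  # pooling kept, classifier dropped
print(model(x).shape)                      # expected: (1, model.num_features)

model.reset_classifier(0, global_pool='')  # pooling disabled, so flatten is Identity
print(model(x).shape)                      # expected: (1, model.num_features, H/32, W/32)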

timm/models/ghostnet.py  (+5, -4)

@@ -133,7 +133,7 @@ def forward(self, x):


 class GhostNet(nn.Module):
-    def __init__(self, cfgs, num_classes=1000, width=1.0, dropout=0.2, in_chans=3, output_stride=32):
+    def __init__(self, cfgs, num_classes=1000, width=1.0, dropout=0.2, in_chans=3, output_stride=32, global_pool='avg'):
         super(GhostNet, self).__init__()
         # setting of inverted residual blocks
         assert output_stride == 32, 'only output_stride==32 is valid, dilation not supported'
@@ -178,9 +178,10 @@ def __init__(self, cfgs, num_classes=1000, width=1.0, dropout=0.2, in_chans=3, o

         # building last several layers
         self.num_features = out_chs = 1280
-        self.global_pool = SelectAdaptivePool2d(pool_type='avg')
+        self.global_pool = SelectAdaptivePool2d(pool_type=global_pool)
         self.conv_head = nn.Conv2d(prev_chs, out_chs, 1, 1, 0, bias=True)
         self.act2 = nn.ReLU(inplace=True)
+        self.flatten = nn.Flatten(1) if global_pool else nn.Identity()  # don't flatten if pooling disabled
         self.classifier = Linear(out_chs, num_classes)

     def get_classifier(self):
@@ -190,6 +191,7 @@ def reset_classifier(self, num_classes, global_pool='avg'):
         self.num_classes = num_classes
         # cannot meaningfully change pooling of efficient head after creation
         self.global_pool = SelectAdaptivePool2d(pool_type=global_pool)
+        self.flatten = nn.Flatten(1) if global_pool else nn.Identity()  # don't flatten if pooling disabled
         self.classifier = Linear(self.pool_dim, num_classes) if num_classes > 0 else nn.Identity()

     def forward_features(self, x):
@@ -204,8 +206,7 @@ def forward_features(self, x):

     def forward(self, x):
         x = self.forward_features(x)
-        if not self.global_pool.is_identity():
-            x = x.view(x.size(0), -1)
+        x = self.flatten(x)
         if self.dropout > 0.:
             x = F.dropout(x, p=self.dropout, training=self.training)
         x = self.classifier(x)
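GhostNet keeps an "efficient head" (pooling, conv_head and classifier all operate on the pooled map), so pooling is fixed at construction; the new global_pool argument and flatten module make its num_classes=0 path behave like the other models. A short hedged sketch (ghostnet_100 name assumed):

import torch
import timm

model = timm.create_model('ghostnet_100', pretrained=False)
model.eval()
model.reset_classifier(0)  # classifier -> nn.Identity(), pooled pre-logits remain

with torch.no_grad():
    feats = model(torch.randn(1, 3, 224, 224))
print(feats.shape)  # expected: (1, 1280), i.e. (1, model.num_features)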
