diff --git a/mmselfsup/models/necks/milan_neck.py b/mmselfsup/models/necks/milan_neck.py
index c4afbfd65..29edac6c1 100644
--- a/mmselfsup/models/necks/milan_neck.py
+++ b/mmselfsup/models/necks/milan_neck.py
@@ -26,6 +26,8 @@ class MILANPretrainDecoder(MAEPretrainDecoder):
         decoder_depth (int): The depth of decoder. Defaults to 8.
         decoder_num_heads (int): Number of attention heads of decoder.
             Defaults to 16.
+        predict_feature_dim (int): The dimension of the feature to be
+            predicted. Defaults to 512.
         mlp_ratio (int): Ratio of mlp hidden dim to decoder's embedding dim.
             Defaults to 4.
         norm_cfg (dict): Normalization layer. Defaults to LayerNorm.
@@ -41,6 +43,7 @@ def __init__(self,
                  decoder_embed_dim: int = 512,
                  decoder_depth: int = 8,
                  decoder_num_heads: int = 16,
+                 predict_feature_dim: int = 512,
                  mlp_ratio: int = 4,
                  norm_cfg: dict = dict(type='LN', eps=1e-6),
                  init_cfg: Optional[Union[List[dict], dict]] = None) -> None:
@@ -58,7 +61,8 @@ def __init__(self,

         # map the dim of features from decoder to the dim compatible with
         # that of CLIP
-        self.decoder_pred = nn.Linear(decoder_embed_dim, 512, bias=True)
+        self.decoder_pred = nn.Linear(
+            decoder_embed_dim, predict_feature_dim, bias=True)

         # use prompt transformer encoder layer, instead of the conventional
         # transformer encoder layer
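
The change replaces the hard-coded output dimension of `decoder_pred` with a configurable `predict_feature_dim`, so the decoder head can match CLIP variants whose feature dimension is not 512. Below is a minimal usage sketch, not part of the diff: it assumes `MILANPretrainDecoder` is exported from `mmselfsup.models.necks` and that the remaining inherited constructor arguments (e.g. `num_patches`, `embed_dim`) keep their defaults.

```python
from mmselfsup.models.necks import MILANPretrainDecoder

# All values shown are the defaults from the signature in the diff above;
# only predict_feature_dim is MILAN-specific and should match the dimension
# of the CLIP features used as the reconstruction target.
decoder = MILANPretrainDecoder(
    decoder_embed_dim=512,
    decoder_depth=8,
    decoder_num_heads=16,
    predict_feature_dim=512,  # set to the target CLIP feature dimension
)
```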