From 9c5dcf40864d0a0096282a92f91ddc67f60fcf15 Mon Sep 17 00:00:00 2001 From: Tau <674106399@qq.com> Date: Mon, 6 Feb 2023 21:48:43 +0800 Subject: [PATCH] [Docs] update codec docs (#1952) --- docs/en/user_guides/codecs.md | 46 ++++++++++++++++++++++++------- docs/zh_cn/user_guides/codecs.md | 47 ++++++++++++++++++++++++-------- 2 files changed, 72 insertions(+), 21 deletions(-) diff --git a/docs/en/user_guides/codecs.md b/docs/en/user_guides/codecs.md index 9381031f42..ca6ebccf63 100644 --- a/docs/en/user_guides/codecs.md +++ b/docs/en/user_guides/codecs.md @@ -26,11 +26,9 @@ The encoder transforms the coordinates in the input image space into the needed For example, in the Regression-based method, the encoder will be: ```Python -def encode( - self, - keypoints: np.ndarray, - keypoints_visible: Optional[np.ndarray] = None -) -> Tuple[np.ndarray, np.ndarray]: +def encode(self, + keypoints: np.ndarray, + keypoints_visible: Optional[np.ndarray] = None) -> dict: """Encoding keypoints from input image space to normalized space. 
Args: @@ -39,13 +37,12 @@ def encode( (N, K) Returns: - tuple: - - reg_labels (np.ndarray): The normalized regression labels in + dict: + - keypoint_labels (np.ndarray): The normalized regression labels in shape (N, K, D) where D is 2 for 2d coordinates - keypoint_weights (np.ndarray): The target weights in shape (N, K) """ - if keypoints_visible is None: keypoints_visible = np.ones(keypoints.shape[:2], dtype=np.float32) @@ -54,10 +51,39 @@ def encode( (keypoints <= [w - 1, h - 1])).all(axis=-1) & ( keypoints_visible > 0.5) - reg_labels = (keypoints / np.array([w, h])).astype(np.float32) + keypoint_labels = (keypoints / np.array([w, h])).astype(np.float32) keypoint_weights = np.where(valid, 1., 0.).astype(np.float32) - return reg_labels, keypoint_weights + encoded = dict( + keypoint_labels=keypoint_labels, keypoint_weights=keypoint_weights) + + return encoded +``` + +The encoded data is converted to Tensor format in `PackPoseInputs` and packed into `data_sample.gt_instance_labels`, where the model can access it. It is mainly used for loss calculation, as shown by `loss()` in `RegressionHead` below: 
+ +```Python +def loss(self, + inputs: Tuple[Tensor], + batch_data_samples: OptSampleList, + train_cfg: ConfigType = {}) -> dict: + """Calculate losses from a batch of inputs and data samples.""" + + pred_outputs = self.forward(inputs) + + keypoint_labels = torch.cat( + [d.gt_instance_labels.keypoint_labels for d in batch_data_samples]) + keypoint_weights = torch.cat([ + d.gt_instance_labels.keypoint_weights for d in batch_data_samples + ]) + + # calculate losses + losses = dict() + loss = self.loss_module(pred_outputs, keypoint_labels, + keypoint_weights.unsqueeze(-1)) + + losses.update(loss_kpt=loss) + ### Omitted ### ``` ### Decoder diff --git a/docs/zh_cn/user_guides/codecs.md b/docs/zh_cn/user_guides/codecs.md index 7c15ee0706..d758b478ee 100644 --- a/docs/zh_cn/user_guides/codecs.md +++ b/docs/zh_cn/user_guides/codecs.md @@ -26,12 +26,9 @@ MMPose 1.0 中引入了新模块 **编解码器(Codec)** ,将关键点数 以 Regression-based 方法的编码器为例: ```Python -@abstractmethod -def encode( - self, - keypoints: np.ndarray, - keypoints_visible: Optional[np.ndarray] = None -) -> Tuple[np.ndarray, np.ndarray]: +def encode(self, + keypoints: np.ndarray, + keypoints_visible: Optional[np.ndarray] = None) -> dict: """Encoding keypoints from input image space to normalized space. 
Args: @@ -40,13 +37,12 @@ def encode( (N, K) Returns: - tuple: - - reg_labels (np.ndarray): The normalized regression labels in + dict: + - keypoint_labels (np.ndarray): The normalized regression labels in shape (N, K, D) where D is 2 for 2d coordinates - keypoint_weights (np.ndarray): The target weights in shape (N, K) """ - if keypoints_visible is None: keypoints_visible = np.ones(keypoints.shape[:2], dtype=np.float32) @@ -55,10 +51,39 @@ def encode( (keypoints <= [w - 1, h - 1])).all(axis=-1) & ( keypoints_visible > 0.5) - reg_labels = (keypoints / np.array([w, h])).astype(np.float32) + keypoint_labels = (keypoints / np.array([w, h])).astype(np.float32) keypoint_weights = np.where(valid, 1., 0.).astype(np.float32) - return reg_labels, keypoint_weights + encoded = dict( + keypoint_labels=keypoint_labels, keypoint_weights=keypoint_weights) + + return encoded +``` + +编码后的数据会在 `PackPoseInputs` 中被转换为 Tensor 格式,并封装到 `data_sample.gt_instance_labels` 中供模型调用,一般主要用于 loss 计算,下面以 `RegressionHead` 中的 `loss()` 为例: + +```Python +def loss(self, + inputs: Tuple[Tensor], + batch_data_samples: OptSampleList, + train_cfg: ConfigType = {}) -> dict: + """Calculate losses from a batch of inputs and data samples.""" + + pred_outputs = self.forward(inputs) + + keypoint_labels = torch.cat( + [d.gt_instance_labels.keypoint_labels for d in batch_data_samples]) + keypoint_weights = torch.cat([ + d.gt_instance_labels.keypoint_weights for d in batch_data_samples + ]) + + # calculate losses + losses = dict() + loss = self.loss_module(pred_outputs, keypoint_labels, + keypoint_weights.unsqueeze(-1)) + + losses.update(loss_kpt=loss) + ### 后续内容省略 ### ``` ### 解码器