Faster RCNN

In earlier posts I covered training, optimizing and speeding up the YOLO-series models; this one is about Faster RCNN. The dataset used here is the same one prepared for YOLO v3 and can be reused essentially as-is. Dataset reference: https://blog.csdn.net/kui9702/article/details/122954209

Code for this section is on GitHub: https://github.com/kile97/faster-rcnn-pytorch or Gitee: https://gitee.com/kui9702_kile/faster-rcnn-pytorch-master

The Faster RCNN network consists of three main parts:

backbone

The main body of the model; it extracts the features.

RegionProposalNetwork (Region Proposal Network)

Converts the features extracted by the backbone into objectness scores and box coordinates, maps them back onto the original image, and uses NMS to keep the required proposal boxes, which are passed on to the RoIHead.

RoIHead (Region of Interest head)

Applies RoI pooling to the proposal boxes and produces the final classification and coordinate (location) outputs.

Faster RCNN overall structure
# Configure anchor_sizes. I use the k-means clusters from my earlier YOLO v3 post; this produces len(sizes) * len(aspect_ratios) anchor boxes per feature-map location
anchor_sizes = ((25,35,50,90,120),)
# anchor_sizes = ((32,), (64,), (128,), (256,), (512,))
aspect_ratios = ((0.5, 1.0, 2.0),)
# Define the anchor box generator for the RPN
rpn_anchor_generator = AnchorGenerator(anchor_sizes, aspect_ratios)
# Define the Faster RCNN model
model = FasterRCNN(backbone, num_classes, rpn_anchor_generator=rpn_anchor_generator)
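
As a quick check of the comment above, the generator can report how many anchor templates it places at each feature-map location: with 5 sizes and 3 aspect ratios that is 15. A minimal sketch (mine, assuming the torchvision AnchorGenerator):

from torchvision.models.detection.anchor_utils import AnchorGenerator

anchor_sizes = ((25, 35, 50, 90, 120),)
aspect_ratios = ((0.5, 1.0, 2.0),)
rpn_anchor_generator = AnchorGenerator(anchor_sizes, aspect_ratios)
# anchors per spatial location = len(sizes) * len(aspect_ratios)
print(rpn_anchor_generator.num_anchors_per_location())  # [15]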

Backbone

The backbone is ResNet50. The _resnet_fpn_extractor function separates out the shared feature layers of ResNet50 and wraps them with an FPN; the features used by the rest of the model are taken from this part of the network.

# Get the resnet50 backbone; either torchvision's pretrained ResNet50 weights or Faster RCNN pretrained weights can be used
backbone = resnet50(pretrained=pretrained_backbone, progress=True, norm_layer=FrozenBatchNorm2d)
# Use _resnet_fpn_extractor to build the Faster RCNN backbone; the feature layers kept from the backbone are layers [1, 2, 3, 4]
backbone = _resnet_fpn_extractor(backbone, trainable_backbone_layers)
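
To see what this backbone produces, here is a small sketch (mine, assuming a torchvision version where the private _resnet_fpn_extractor helper is available): feeding a dummy image through it returns an OrderedDict of five 256-channel feature maps, the four FPN levels plus the extra "pool" level.

import torch
from torchvision.models import resnet50
from torchvision.models.detection.backbone_utils import _resnet_fpn_extractor
from torchvision.ops.misc import FrozenBatchNorm2d

# Build the FPN backbone as above and run a dummy 800x960 image through it
backbone = _resnet_fpn_extractor(resnet50(norm_layer=FrozenBatchNorm2d), trainable_layers=3)
feats = backbone(torch.rand(1, 3, 800, 960))
print([(k, tuple(v.shape)) for k, v in feats.items()])
# [('0', (1, 256, 200, 240)), ('1', (1, 256, 100, 120)), ('2', (1, 256, 50, 60)),
#  ('3', (1, 256, 25, 30)), ('pool', (1, 256, 13, 15))]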

class BackboneWithFPN(nn.Module):
    """
    Feature extraction: composed of the backbone followed by an FPN
    """

    def __init__(
        self,
        backbone: nn.Module,
        return_layers: Dict[str, str],
        in_channels_list: List[int],
        out_channels: int,
        extra_blocks: Optional[ExtraFPNBlock] = None,
    ) -> None:
        super().__init__()

        if extra_blocks is None:
            extra_blocks = LastLevelMaxPool()

        self.body = IntermediateLayerGetter(backbone, return_layers=return_layers)
        self.fpn = FeaturePyramidNetwork(
            in_channels_list=in_channels_list,
            out_channels=out_channels,
            extra_blocks=extra_blocks,
        )
        self.out_channels = out_channels

    def forward(self, x: Tensor) -> Dict[str, Tensor]:
        # Run the input through the backbone body to get the 4 feature maps
        x = self.body(x)
        # Run them through the feature pyramid to get better, finer features
        x = self.fpn(x)
        return x

class IntermediateLayerGetter(nn.ModuleDict):
    """
    Module wrapper that returns intermediate layers from a model
    """

    def __init__(self, model: nn.Module, return_layers: Dict[str, str]) -> None:
        if not set(return_layers).issubset([name for name, _ in model.named_children()]):
            raise ValueError("return_layers are not present in model")
        orig_return_layers = return_layers
        return_layers = {str(k): str(v) for k, v in return_layers.items()}
        layers = OrderedDict()
        for name, module in model.named_children():
            layers[name] = module
            if name in return_layers:
                del return_layers[name]
            if not return_layers:
                break

        super().__init__(layers)
        self.return_layers = orig_return_layers

    def forward(self, x):
        # The output is an ordered dict
        out = OrderedDict()
        # Iterate over the model's children (conv1, layer1, layer2, layer3, ... — see the ResNet structure) and collect
        # the outputs of the layers listed in backbone.return_layers = {'layer1': '0', 'layer2': '1', 'layer3': '2', 'layer4': '3'}
        for name, module in self.items():
            x = module(x)
            if name in self.return_layers:
                out_name = self.return_layers[name]
                out[out_name] = x
        return out
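
A small usage sketch (mine, using a plain torchvision ResNet50 and the class above): the outputs of layer1 through layer4 are collected under the new names '0' to '3'.

import torch
from torchvision.models import resnet50

body = IntermediateLayerGetter(
    resnet50(), return_layers={"layer1": "0", "layer2": "1", "layer3": "2", "layer4": "3"}
)
outs = body(torch.rand(1, 3, 224, 224))
print([(k, tuple(v.shape)) for k, v in outs.items()])
# [('0', (1, 256, 56, 56)), ('1', (1, 512, 28, 28)), ('2', (1, 1024, 14, 14)), ('3', (1, 2048, 7, 7))]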

class FeaturePyramidNetwork(nn.Module):
    """
    Module that adds an FPN on top of a set of feature maps. This is based on
    `"Feature Pyramid Network for Object Detection" <https://arxiv.org/abs/1612.03144>`_.

    The feature maps are currently supposed to be in increasing depth
    order.

    The input to the model is expected to be an OrderedDict[Tensor], containing
    the feature maps on top of which the FPN will be added.

    Args:
        in_channels_list (list[int]): number of channels for each feature map that
            is passed to the module
        out_channels (int): number of channels of the FPN representation
        extra_blocks (ExtraFPNBlock or None): if provided, extra operations will
            be performed. It is expected to take the fpn features, the original
            features and the names of the original features as input, and returns
            a new list of feature maps and their corresponding names

    Examples::

        >>> m = torchvision.ops.FeaturePyramidNetwork([10, 20, 30], 5)
        >>> # get some dummy data
        >>> x = OrderedDict()
        >>> x['feat0'] = torch.rand(1, 10, 64, 64)
        >>> x['feat2'] = torch.rand(1, 20, 16, 16)
        >>> x['feat3'] = torch.rand(1, 30, 8, 8)
        >>> # compute the FPN on top of x
        >>> output = m(x)
        >>> print([(k, v.shape) for k, v in output.items()])
        >>> # returns
        >>>   [('feat0', torch.Size([1, 5, 64, 64])),
        >>>    ('feat2', torch.Size([1, 5, 16, 16])),
        >>>    ('feat3', torch.Size([1, 5, 8, 8]))]

    """

    def __init__(
        self,
        in_channels_list: List[int],
        out_channels: int,
        extra_blocks: Optional[ExtraFPNBlock] = None,
    ):
        super().__init__()
        _log_api_usage_once(self)
        self.inner_blocks = nn.ModuleList()
        self.layer_blocks = nn.ModuleList()
        for in_channels in in_channels_list:
            if in_channels == 0:
                raise ValueError("in_channels=0 is currently not supported")
            inner_block_module = nn.Conv2d(in_channels, out_channels, 1)
            layer_block_module = nn.Conv2d(out_channels, out_channels, 3, padding=1)
            self.inner_blocks.append(inner_block_module)
            self.layer_blocks.append(layer_block_module)

        # initialize parameters now to avoid modifying the initialization of top_blocks
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_uniform_(m.weight, a=1)
                nn.init.constant_(m.bias, 0)

        if extra_blocks is not None:
            assert isinstance(extra_blocks, ExtraFPNBlock)
        self.extra_blocks = extra_blocks

    def get_result_from_inner_blocks(self, x: Tensor, idx: int) -> Tensor:
        """
        This is equivalent to self.inner_blocks[idx](x),
        but torchscript doesn't support this yet
        """
        num_blocks = len(self.inner_blocks)
        if idx < 0:
            idx += num_blocks
        out = x
        for i, module in enumerate(self.inner_blocks):
            if i == idx:
                out = module(x)
        return out

    def get_result_from_layer_blocks(self, x: Tensor, idx: int) -> Tensor:
        """
        This is equivalent to self.layer_blocks[idx](x),
        but torchscript doesn't support this yet
        """
        num_blocks = len(self.layer_blocks)
        if idx < 0:
            idx += num_blocks
        out = x
        for i, module in enumerate(self.layer_blocks):
            if i == idx:
                out = module(x)
        return out

    def forward(self, x: Dict[str, Tensor]) -> Dict[str, Tensor]:
        """
        Computes the FPN for a set of feature maps.

        Args:
            x (OrderedDict[Tensor]): feature maps for each feature level.

        Returns:
            results (OrderedDict[Tensor]): feature maps after FPN layers.
                They are ordered from highest resolution first.
        """
        # Get the names of the feature levels
        names = list(x.keys())
        # Get the feature maps
        x = list(x.values())

        # Take the deepest feature map and run it through its 1x1 lateral conv
        # equivalent to last_inner = self.inner_blocks[-1](x)
        last_inner = self.get_result_from_inner_blocks(x[-1], -1)
        results = []
        # equivalent to results.append(self.layer_blocks[-1](last_inner))
        results.append(self.get_result_from_layer_blocks(last_inner, -1))

        # Build the feature pyramid over the remaining levels:
        # the top-down pathway upsamples the higher-level feature and fuses it (by addition) with the lower-level lateral feature
        for idx in range(len(x) - 2, -1, -1):
            inner_lateral = self.get_result_from_inner_blocks(x[idx], idx)
            feat_shape = inner_lateral.shape[-2:]
            # Upsample last_inner with nearest-neighbor interpolation
            inner_top_down = F.interpolate(last_inner, size=feat_shape, mode="nearest")
            # Add inner_lateral and inner_top_down
            last_inner = inner_lateral + inner_top_down
            # Save the fused feature
            results.insert(0, self.get_result_from_layer_blocks(last_inner, idx))

        # extra_blocks is user-configurable; if not set it defaults to a max pool, which adds an extra pooled level
        if self.extra_blocks is not None:
            results, names = self.extra_blocks(results, x, names)

        # make it back an OrderedDict
        out = OrderedDict([(k, v) for k, v in zip(names, results)])
        # The output shapes here are e.g. {'0': [2, 256, 200, 240], '1': [2, 256, 100, 120], '2': [2, 256, 50, 60], '3': [2, 256, 25, 30], 'pool': [2, 256, 13, 15]}
        return out

Faster RCNN subclasses GeneralizedRCNN; the forward pass shared by the R-CNN family is implemented in GeneralizedRCNN.

class GeneralizedRCNN(nn.Module):
    """
    Base implementation class for Faster RCNN
    """

    def __init__(self, backbone: nn.Module, rpn: nn.Module, roi_heads: nn.Module, transform: nn.Module) -> None:
        super().__init__()
        # log torchvision/torch API usage
        _log_api_usage_once(self)
        # store the arguments as attributes
        self.transform = transform
        self.backbone = backbone
        self.rpn = rpn
        self.roi_heads = roi_heads
        # used only on torchscript mode
        self._has_warned = False

    @torch.jit.unused
    def eager_outputs(self, losses, detections):
        # type: (Dict[str, Tensor], List[Dict[str, Tensor]]) -> Union[Dict[str, Tensor], List[Dict[str, Tensor]]]
        if self.training:
            return losses

        return detections

    def forward(self, images, targets=None):
        # type: (List[Tensor], Optional[List[Dict[str, Tensor]]]) -> Tuple[Dict[str, Tensor], List[Dict[str, Tensor]]]
        """
        Args:
            images (list[Tensor]): images to be processed
            targets (list[Dict[str, Tensor]]): ground-truth boxes present in the image (optional)

        Returns:
            result (list[BoxList] or dict[Tensor]): the output from the model.
                During training, it returns a dict[Tensor] which contains the losses.
                During testing, it returns list[BoxList] contains additional fields
                like `scores`, `labels` and `mask` (for Mask R-CNN models).

        """
        if self.training and targets is None:
            raise ValueError("In training mode, targets should be passed")
        if self.training:
            assert targets is not None
            for target in targets:
                # get the ground-truth boxes
                boxes = target["boxes"]
                if isinstance(boxes, torch.Tensor):
                    if len(boxes.shape) != 2 or boxes.shape[-1] != 4:
                        raise ValueError(f"Expected target boxes to be a tensor of shape [N, 4], got {boxes.shape}.")
                else:
                    raise ValueError(f"Expected target boxes to be of type Tensor, got {type(boxes)}.")

        original_image_sizes: List[Tuple[int, int]] = []
        # iterate over the images
        for img in images:
            # img.shape is [channels, height, width]
            val = img.shape[-2:]
            assert len(val) == 2
            # save the original image size
            original_image_sizes.append((val[0], val[1]))
        # preprocess the images: the list of Tensors is converted into an ImageList, which stores the batched tensor together with the resized image sizes
        images, targets = self.transform(images, targets)

        # Check for degenerate boxes
        # TODO: Move this to a function
        if targets is not None:
            for target_idx, target in enumerate(targets):
                # sanity-check the annotations, e.g. for coordinates that are out of order or out of bounds
                boxes = target["boxes"]
                degenerate_boxes = boxes[:, 2:] <= boxes[:, :2]
                if degenerate_boxes.any():
                    # print the first degenerate box
                    bb_idx = torch.where(degenerate_boxes.any(dim=1))[0][0]
                    degen_bb: List[float] = boxes[bb_idx].tolist()
                    raise ValueError(
                        "All bounding boxes should have positive height and width."
                        f" Found invalid box {degen_bb} for target at index {target_idx}."
                    )
        # extract the image features with the backbone
        features = self.backbone(images.tensors)
        if isinstance(features, torch.Tensor):
            features = OrderedDict([("0", features)])
        # feed the images (ImageList), the backbone features and the targets into the region proposal network (RPN); in training mode this returns the proposals together with the RPN classification and localization losses
        proposals, proposal_losses = self.rpn(images, features, targets)
        # feed the features, proposals, image sizes and targets into the RoI head; in training mode this returns the classification and localization losses with detections=[], in eval mode (model.eval()) it returns the predicted boxes with their class scores and detector_losses is empty
        detections, detector_losses = self.roi_heads(features, proposals, images.image_sizes, targets)
        # in training mode detections=[]; in eval mode postprocess maps the predicted boxes back to the original image sizes
        detections = self.transform.postprocess(detections, images.image_sizes, original_image_sizes)  # type: ignore[operator]

        losses = {}
        losses.update(detector_losses)
        losses.update(proposal_losses)

        if torch.jit.is_scripting():
            if not self._has_warned:
                warnings.warn("RCNN always returns a (Losses, Detections) tuple in scripting")
                self._has_warned = True
            return losses, detections
        else:
            return self.eager_outputs(losses, detections)
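
The practical consequence of eager_outputs is worth spelling out with a tiny usage sketch (mine, with made-up tensors): in training mode the model consumes images and targets and returns only the loss dict; in eval mode it consumes images and returns a list of detection dicts.

import torch
from torchvision.models.detection import fasterrcnn_resnet50_fpn

model = fasterrcnn_resnet50_fpn(num_classes=3)
images = [torch.rand(3, 600, 800)]
targets = [{"boxes": torch.tensor([[20.0, 30.0, 200.0, 180.0]]), "labels": torch.tensor([1])}]

model.train()
losses = model(images, targets)   # dict with loss_objectness, loss_rpn_box_reg, loss_classifier, loss_box_reg

model.eval()
with torch.no_grad():
    detections = model(images)    # list with one dict per image: boxes, labels, scores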

RegionProposalNetwork (Region Proposal Network)

Selects the candidate boxes (proposals) that meet the configured criteria.

# Build the RPN
rpn = RegionProposalNetwork(
      rpn_anchor_generator,   # anchor box generator, described below
      rpn_head,   # a simple classification and regression head, described below
      rpn_fg_iou_thresh,  # IoU threshold for foreground (object) anchors
      rpn_bg_iou_thresh,  # IoU threshold for background anchors
      rpn_batch_size_per_image,   # number of anchors sampled per image when computing the RPN loss
      rpn_positive_fraction,  # fraction of positives among the sampled anchors, e.g. with rpn_batch_size_per_image=256 and rpn_positive_fraction=0.5 there are 128 positives and 128 negatives
      rpn_pre_nms_top_n,  # number of proposals to keep before NMS
      rpn_post_nms_top_n, # number of proposals to keep after NMS
      rpn_nms_thresh,   # IoU threshold used by NMS
      score_thresh=rpn_score_thresh,  # score threshold used to drop low-scoring proposals
  )
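
For reference, here is a sketch of plausible values for the arguments above (these happen to be the torchvision FasterRCNN defaults); note that the pre/post-NMS counts are dicts keyed by "training" and "testing", matching pre_nms_top_n()/post_nms_top_n() below.

rpn_fg_iou_thresh, rpn_bg_iou_thresh = 0.7, 0.3
rpn_batch_size_per_image, rpn_positive_fraction = 256, 0.5
rpn_pre_nms_top_n = dict(training=2000, testing=1000)
rpn_post_nms_top_n = dict(training=2000, testing=1000)
rpn_nms_thresh, rpn_score_thresh = 0.7, 0.0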


class RegionProposalNetwork(torch.nn.Module):
    '''
        The RPN module
    '''
    __annotations__ = {
        "box_coder": det_utils.BoxCoder,
        "proposal_matcher": det_utils.Matcher,
        "fg_bg_sampler": det_utils.BalancedPositiveNegativeSampler,
    }

    def __init__(
        self,
        anchor_generator: AnchorGenerator,
        head: nn.Module,
        # Faster-RCNN Training
        fg_iou_thresh: float,
        bg_iou_thresh: float,
        batch_size_per_image: int,
        positive_fraction: float,
        # Faster-RCNN Inference
        pre_nms_top_n: Dict[str, int],
        post_nms_top_n: Dict[str, int],
        nms_thresh: float,
        score_thresh: float = 0.0,
    ) -> None:
        super().__init__()
        self.anchor_generator = anchor_generator
        self.head = head
        self.box_coder = det_utils.BoxCoder(weights=(1.0, 1.0, 1.0, 1.0))

        # used during training
        self.box_similarity = box_ops.box_iou

        self.proposal_matcher = det_utils.Matcher(
            fg_iou_thresh,
            bg_iou_thresh,
            allow_low_quality_matches=True,
        )

        self.fg_bg_sampler = det_utils.BalancedPositiveNegativeSampler(batch_size_per_image, positive_fraction)
        # used during testing
        self._pre_nms_top_n = pre_nms_top_n
        self._post_nms_top_n = post_nms_top_n
        self.nms_thresh = nms_thresh
        self.score_thresh = score_thresh
        self.min_size = 1e-3

    def pre_nms_top_n(self) -> int:
        if self.training:
            return self._pre_nms_top_n["training"]
        return self._pre_nms_top_n["testing"]

    def post_nms_top_n(self) -> int:
        if self.training:
            return self._post_nms_top_n["training"]
        return self._post_nms_top_n["testing"]

    def assign_targets_to_anchors(
        self, anchors: List[Tensor], targets: List[Dict[str, Tensor]]
    ) -> Tuple[List[Tensor], List[Tensor]]:

        labels = []
        matched_gt_boxes = []
        for anchors_per_image, targets_per_image in zip(anchors, targets):
            gt_boxes = targets_per_image["boxes"]

            if gt_boxes.numel() == 0:
                # Background image (negative example)
                device = anchors_per_image.device
                matched_gt_boxes_per_image = torch.zeros(anchors_per_image.shape, dtype=torch.float32, device=device)
                labels_per_image = torch.zeros((anchors_per_image.shape[0],), dtype=torch.float32, device=device)
            else:
                match_quality_matrix = self.box_similarity(gt_boxes, anchors_per_image) # compute the IoU between the anchor boxes and the ground-truth boxes
                matched_idxs = self.proposal_matcher(match_quality_matrix)  # for each anchor, the index of the gt box with the highest IoU (-1 if iou < 0.3, -2 if 0.3 < iou < 0.7; the thresholds are set when the matcher is constructed)
                # clamp(min=0) makes it easy to gather the matched gt_boxes for every anchor:
                # negative and discarded samples have negative indices, so clamping avoids out-of-range indexing.
                # Positive-sample positions are recorded later via labels_per_image,
                # so the gt_boxes gathered for negative/discarded samples carry no meaning;
                # only positives are used when computing the box regression loss anyway.
                matched_gt_boxes_per_image = gt_boxes[matched_idxs.clamp(min=0)]

                # record the label of every matched anchor (positives are 1, negatives 0, discarded samples -1)
                labels_per_image = matched_idxs >= 0
                labels_per_image = labels_per_image.to(dtype=torch.float32)

                # background anchors get label 0 and are later trained as negative samples
                bg_indices = matched_idxs == self.proposal_matcher.BELOW_LOW_THRESHOLD
                labels_per_image[bg_indices] = 0.0

                # anchors whose IoU falls between the two thresholds get label -1.0 and are discarded (not trained on)
                inds_to_discard = matched_idxs == self.proposal_matcher.BETWEEN_THRESHOLDS
                labels_per_image[inds_to_discard] = -1.0

            labels.append(labels_per_image)
            matched_gt_boxes.append(matched_gt_boxes_per_image)
        return labels, matched_gt_boxes

    def _get_top_n_idx(self, objectness: Tensor, num_anchors_per_level: List[int]) -> Tensor:
        r = []
        offset = 0
        for ob in objectness.split(num_anchors_per_level, 1):
            if torchvision._is_tracing():
                num_anchors, pre_nms_top_n = _onnx_get_num_anchors_and_pre_nms_top_n(ob, self.pre_nms_top_n())
            else:
                num_anchors = ob.shape[1]
                pre_nms_top_n = min(self.pre_nms_top_n(), num_anchors)
            _, top_n_idx = ob.topk(pre_nms_top_n, dim=1)
            r.append(top_n_idx + offset)
            offset += num_anchors
        return torch.cat(r, dim=1)

    def filter_proposals(
        self,
        proposals: Tensor,
        objectness: Tensor,
        image_shapes: List[Tuple[int, int]],
        num_anchors_per_level: List[int],
    ) -> Tuple[List[Tensor], List[Tensor]]:

        num_images = proposals.shape[0]     # number of images in the batch
        device = proposals.device
        # do not backprop through objectness
        objectness = objectness.detach()
        objectness = objectness.reshape(num_images, -1)     # reshape the anchor objectness scores to [num_images, anchors_per_image]

        levels = [      # levels records which prediction feature level each anchor index belongs to
            torch.full((n,), idx, dtype=torch.int64, device=device) for idx, n in enumerate(num_anchors_per_level)
        ]
        levels = torch.cat(levels, 0)
        levels = levels.reshape(1, -1).expand_as(objectness)

        # select top_n boxes independently per level before applying nms
        top_n_idx = self._get_top_n_idx(objectness, num_anchors_per_level)  # select the top pre_nms_top_n anchors on each feature level; roughly num_levels * pre_nms_top_n anchors remain

        image_range = torch.arange(num_images, device=device)
        batch_idx = image_range[:, None]

        objectness = objectness[batch_idx, top_n_idx]   # objectness scores of the top pre_nms_top_n anchors on each feature level
        levels = levels[batch_idx, top_n_idx]
        proposals = proposals[batch_idx, top_n_idx]     # box coordinates of the top pre_nms_top_n anchors

        objectness_prob = torch.sigmoid(objectness)     # objectness probability of each kept anchor

        final_boxes = []
        final_scores = []
        for boxes, scores, lvl, img_shape in zip(proposals, objectness_prob, levels, image_shapes):
            boxes = box_ops.clip_boxes_to_image(boxes, img_shape)   # clip the boxes to the image so they do not go out of bounds

            # remove small boxes
            keep = box_ops.remove_small_boxes(boxes, self.min_size)     # remove boxes that are too small
            boxes, scores, lvl = boxes[keep], scores[keep], lvl[keep]

            # remove low scoring boxes
            # use >= for Backwards compatibility
            keep = torch.where(scores >= self.score_thresh)[0]      # drop boxes with low objectness scores
            boxes, scores, lvl = boxes[keep], scores[keep], lvl[keep]

            # non-maximum suppression, independently done per level
            keep = box_ops.batched_nms(boxes, scores, lvl, self.nms_thresh)     # NMS removes heavily overlapping boxes

            # keep only topk scoring predictions
            keep = keep[: self.post_nms_top_n()]     # keep self.post_nms_top_n() proposals for the RoI head
            boxes, scores = boxes[keep], scores[keep]

            final_boxes.append(boxes)
            final_scores.append(scores)
        return final_boxes, final_scores

    def compute_loss(
        self, objectness: Tensor, pred_bbox_deltas: Tensor, labels: List[Tensor], regression_targets: List[Tensor]
    ) -> Tuple[Tensor, Tensor]:
        """
        Args:
            objectness (Tensor)
            pred_bbox_deltas (Tensor)
            labels (List[Tensor])
            regression_targets (List[Tensor])

        Returns:
            objectness_loss (Tensor)
            box_loss (Tensor)
        """

        # sample positive and negative anchors according to the configured batch size and positive fraction
        sampled_pos_inds, sampled_neg_inds = self.fg_bg_sampler(labels)
        # concatenate the per-image sample masks and get the indices of the positive samples
        sampled_pos_inds = torch.where(torch.cat(sampled_pos_inds, dim=0))[0]
        # concatenate the per-image sample masks and get the indices of the negative samples
        sampled_neg_inds = torch.where(torch.cat(sampled_neg_inds, dim=0))[0]
        # concatenate the positive and negative sample indices
        sampled_inds = torch.cat([sampled_pos_inds, sampled_neg_inds], dim=0)

        # flatten objectness from [n, 1] to [n]
        objectness = objectness.flatten()

        # concatenate the labels from all images into a single 1-D tensor
        labels = torch.cat(labels, dim=0)
        regression_targets = torch.cat(regression_targets, dim=0)
        
        # compute the box regression loss (smooth L1 over the positive samples, normalized by the total number of sampled anchors)
        box_loss = (            
            F.smooth_l1_loss(
                pred_bbox_deltas[sampled_pos_inds],
                regression_targets[sampled_pos_inds],
                beta=1 / 9,
                reduction="sum",
            )
            / (sampled_inds.numel())
        )
        # compute the binary (object vs. background) classification loss
        objectness_loss = F.binary_cross_entropy_with_logits(objectness[sampled_inds], labels[sampled_inds])

        return objectness_loss, box_loss

    def forward(
        self,
        images: ImageList,
        features: Dict[str, Tensor],
        targets: Optional[List[Dict[str, Tensor]]] = None,
    ) -> Tuple[List[Tensor], Dict[str, Tensor]]:

        """
        Args:
            images (ImageList): images for which we want to compute the predictions
            features (Dict[str, Tensor]): features computed from the images that are
                used for computing the predictions. Each tensor in the list
                correspond to different feature levels
            targets (List[Dict[str, Tensor]]): ground-truth boxes present in the image (optional).
                If provided, each element in the dict should contain a field `boxes`,
                with the locations of the ground-truth boxes.

        Returns:
            boxes (List[Tensor]): the predicted boxes from the RPN, one Tensor per
                image.
            losses (Dict[str, Tensor]): the losses for the model during training. During
                testing, it is an empty dict.
        """
        # convert the feature dict to a list
        features = list(features.values())
        # get the objectness logits and predicted box deltas from the RPN head
        objectness, pred_bbox_deltas = self.head(features)
        # generate the configured anchor boxes
        anchors = self.anchor_generator(images, features)

        num_images = len(anchors)
        # shape of the objectness prediction on each feature level
        num_anchors_per_level_shape_tensors = [o[0].shape for o in objectness]
        # number of anchors on each feature level (anchors per location * H * W)
        num_anchors_per_level = [s[0] * s[1] * s[2] for s in num_anchors_per_level_shape_tensors]
        # flatten the per-level outputs into per-anchor objectness scores and box deltas
        objectness, pred_bbox_deltas = concat_box_prediction_layers(objectness, pred_bbox_deltas)
        # apply pred_bbox_deltas to the anchors to obtain the decoded proposals;
        # the deltas are detached because Faster R-CNN does not backprop through the proposals
        proposals = self.box_coder.decode(pred_bbox_deltas.detach(), anchors)   # decode the regression output into proposal boxes in the same format as the anchor boxes
        proposals = proposals.view(num_images, -1, 4)   # split the proposals across the images in the batch
        boxes, scores = self.filter_proposals(proposals, objectness, images.image_sizes, num_anchors_per_level)    # filter the proposals that will be used downstream

        losses = {}
        if self.training:
            assert targets is not None
            labels, matched_gt_boxes = self.assign_targets_to_anchors(anchors, targets) # match each anchor to its best gt box and classify anchors as foreground, background or discarded
            regression_targets = self.box_coder.encode(matched_gt_boxes, anchors)   # compute the ground-truth (gt box) regression targets
            loss_objectness, loss_rpn_box_reg = self.compute_loss(      # compute the losses
                objectness, pred_bbox_deltas, labels, regression_targets
            )
            losses = {
                "loss_objectness": loss_objectness,
                "loss_rpn_box_reg": loss_rpn_box_reg,
            }
        return boxes, losses
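
filter_proposals above relies on box_ops.batched_nms, which runs NMS independently per feature level: passing the level index as idxs means boxes from different levels never suppress each other. A small sketch (mine, with made-up boxes):

import torch
from torchvision.ops import batched_nms

boxes = torch.tensor([[0., 0., 10., 10.],
                      [1., 1., 11., 11.],
                      [0., 0., 10., 10.]])
scores = torch.tensor([0.9, 0.8, 0.7])
levels = torch.tensor([0, 0, 1])   # the third box lives on a different feature level
# box 1 overlaps box 0 on the same level and is suppressed; box 2 is kept despite IoU = 1
print(batched_nms(boxes, scores, levels, iou_threshold=0.5))  # tensor([0, 2])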

class RPNHead(nn.Module):
    """
    Adds a simple RPN Head with classification and regression heads

    Args:
        in_channels (int): number of channels of the input feature
        num_anchors (int): number of anchors to be predicted
    """

    def __init__(self, in_channels: int, num_anchors: int) -> None:
        super().__init__()
        self.conv = nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=1, padding=1)
        self.cls_logits = nn.Conv2d(in_channels, num_anchors, kernel_size=1, stride=1)
        self.bbox_pred = nn.Conv2d(in_channels, num_anchors * 4, kernel_size=1, stride=1)

        for layer in self.children():
            torch.nn.init.normal_(layer.weight, std=0.01)  # type: ignore[arg-type]
            torch.nn.init.constant_(layer.bias, 0)  # type: ignore[arg-type]

    def forward(self, x: List[Tensor]) -> Tuple[List[Tensor], List[Tensor]]:
        logits = []
        bbox_reg = []
        # for each feature level, get the classification logits (object vs. background only) and the box regression output
        for feature in x:
            t = F.relu(self.conv(feature))
            logits.append(self.cls_logits(t))
            bbox_reg.append(self.bbox_pred(t))
        return logits, bbox_reg
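
A small sketch (mine, with made-up sizes, using the RPNHead class above) of the output shapes: for a 256-channel FPN level and 15 anchors per location, each spatial cell gets 15 objectness logits and 15 * 4 box deltas.

import torch

head = RPNHead(in_channels=256, num_anchors=15)
logits, bbox_reg = head([torch.rand(1, 256, 50, 60)])   # one feature level
print(logits[0].shape, bbox_reg[0].shape)  # torch.Size([1, 15, 50, 60]) torch.Size([1, 60, 50, 60])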

def concat_box_prediction_layers(box_cls: List[Tensor], box_regression: List[Tensor]) -> Tuple[Tensor, Tensor]:
    box_cls_flattened = []
    box_regression_flattened = []
    # for each feature level, permute the outputs to make them be in the same format as the labels.
    # Note that the labels are computed for all feature levels concatenated, so we keep the same
    # representation for the objectness and the box_regression
    for box_cls_per_level, box_regression_per_level in zip(box_cls, box_regression):
        # get N, channels, height and width
        N, AxC, H, W = box_cls_per_level.shape
        Ax4 = box_regression_per_level.shape[1]
        # number of anchor boxes per spatial location
        A = Ax4 // 4
        # number of classes per anchor; here a single objectness logit (object vs. background), so C = 1
        C = AxC // A
        # reshape [N, AxC, H, W] to [N, -1, C] to get the classification score of each anchor box
        box_cls_per_level = permute_and_flatten(box_cls_per_level, N, A, C, H, W)
        box_cls_flattened.append(box_cls_per_level)
        # reshape [N, Ax4, H, W] to [N, -1, 4] to get the location of each anchor box
        box_regression_per_level = permute_and_flatten(box_regression_per_level, N, A, 4, H, W)
        box_regression_flattened.append(box_regression_per_level)
    # classification score of every anchor box
    box_cls = torch.cat(box_cls_flattened, dim=1).flatten(0, -2)
    # location (regression) output of every anchor box
    box_regression = torch.cat(box_regression_flattened, dim=1).reshape(-1, 4)
    return box_cls, box_regression

class BoxCoder:
    """
    This class encodes and decodes a set of bounding boxes into
    the representation used for training the regressors.
    """

    def __init__(
        self, weights: Tuple[float, float, float, float], bbox_xform_clip: float = math.log(1000.0 / 16)
    ) -> None:
        """
        Args:
            weights (4-element tuple)
            bbox_xform_clip (float)
        """
        self.weights = weights
        self.bbox_xform_clip = bbox_xform_clip

    def encode(self, reference_boxes: List[Tensor], proposals: List[Tensor]) -> List[Tensor]:
        # number of matched gt boxes per image in the batch
        boxes_per_image = [len(b) for b in reference_boxes]
        # concatenate the matched gt boxes
        reference_boxes = torch.cat(reference_boxes, dim=0)
        # concatenate the anchor boxes as well
        proposals = torch.cat(proposals, dim=0)
        # encoding step: encode the gt boxes relative to the anchor boxes, so the targets are in the same scale as the regression output of the feature maps
        targets = self.encode_single(reference_boxes, proposals)
        return targets.split(boxes_per_image, 0)

    def encode_single(self, reference_boxes: Tensor, proposals: Tensor) -> Tensor:
        """
        Encode a set of proposals with respect to some
        reference boxes

        Args:
            reference_boxes (Tensor): reference boxes
            proposals (Tensor): boxes to be encoded
        """
        dtype = reference_boxes.dtype
        device = reference_boxes.device
        weights = torch.as_tensor(self.weights, dtype=dtype, device=device)
        targets = encode_boxes(reference_boxes, proposals, weights)

        return targets

    def decode(self, rel_codes: Tensor, boxes: List[Tensor]) -> Tensor:
        assert isinstance(boxes, (list, tuple))
        assert isinstance(rel_codes, torch.Tensor)
        # number of anchor boxes in each image
        boxes_per_image = [b.size(0) for b in boxes]
        concat_boxes = torch.cat(boxes, dim=0) # stack all anchor boxes in the batch, e.g. ([185460, 4], [185460, 4]) becomes [370920, 4]
        box_sum = 0
        for val in boxes_per_image:
            box_sum += val
        if box_sum > 0:
            rel_codes = rel_codes.reshape(box_sum, -1) # [370920, 4]
        pred_boxes = self.decode_single(rel_codes, concat_boxes)    # decode the actual predicted box coordinates for every anchor box
        if box_sum > 0:
            pred_boxes = pred_boxes.reshape(box_sum, -1, 4)     # reshape [n, 4] into [n, -1, 4]
        return pred_boxes

    def decode_single(self, rel_codes: Tensor, boxes: Tensor) -> Tensor:
        """
        From a set of original boxes and encoded relative box offsets,
        get the decoded boxes.

        Args:
            rel_codes (Tensor): encoded boxes
            boxes (Tensor): reference boxes.
        """

        boxes = boxes.to(rel_codes.dtype)

        widths = boxes[:, 2] - boxes[:, 0]  # widths of the anchor boxes
        heights = boxes[:, 3] - boxes[:, 1] # heights of the anchor boxes
        ctr_x = boxes[:, 0] + 0.5 * widths  # x centers of the anchor boxes (xmin shifted to the middle of the box)
        ctr_y = boxes[:, 1] + 0.5 * heights # y centers of the anchor boxes (ymin shifted to the middle of the box)

        wx, wy, ww, wh = self.weights    # regression weights; rel_codes has shape [n, 4]
        dx = rel_codes[:, 0::4] / wx    # all the x offsets in rel_codes, each as its own column of shape [n, 1]; same for the lines below
        dy = rel_codes[:, 1::4] / wy    # all the y offsets in rel_codes
        dw = rel_codes[:, 2::4] / ww    # all the width offsets in rel_codes
        dh = rel_codes[:, 3::4] / wh    # all the height offsets in rel_codes

        # prevent dw, dh from exceeding the maximum value (avoids overflow in torch.exp)
        dw = torch.clamp(dw, max=self.bbox_xform_clip)
        dh = torch.clamp(dh, max=self.bbox_xform_clip)

        pred_ctr_x = dx * widths[:, None] + ctr_x[:, None]  # predicted box center x
        pred_ctr_y = dy * heights[:, None] + ctr_y[:, None]   # predicted box center y
        pred_w = torch.exp(dw) * widths[:, None]     # predicted box width
        pred_h = torch.exp(dh) * heights[:, None]     # predicted box height

        # Distance from center to box's corner.
        c_to_c_h = torch.tensor(0.5, dtype=pred_ctr_y.dtype, device=pred_h.device) * pred_h     # half of the predicted box height
        c_to_c_w = torch.tensor(0.5, dtype=pred_ctr_x.dtype, device=pred_w.device) * pred_w     # half of the predicted box width

        pred_boxes1 = pred_ctr_x - c_to_c_w     # predicted box xmin
        pred_boxes2 = pred_ctr_y - c_to_c_h     # predicted box ymin
        pred_boxes3 = pred_ctr_x + c_to_c_w     # predicted box xmax
        pred_boxes4 = pred_ctr_y + c_to_c_h     # predicted box ymax
        pred_boxes = torch.stack((pred_boxes1, pred_boxes2, pred_boxes3, pred_boxes4), dim=2).flatten(1)        # stack into (xmin, ymin, xmax, ymax) for every box
        return pred_boxes

@torch.jit._script_if_tracing
def encode_boxes(reference_boxes: Tensor, proposals: Tensor, weights: Tensor) -> Tensor:
    """
    Encode a set of proposals with respect to some
    reference boxes

    Args:
        reference_boxes (Tensor): reference boxes
        proposals (Tensor): boxes to be encoded
        weights (Tensor[4]): the weights for ``(x, y, w, h)``
    """

    # regression weights set when the BoxCoder was constructed
    wx = weights[0]
    wy = weights[1]
    ww = weights[2]
    wh = weights[3]

    # xmin, ymin, xmax, ymax of the anchor boxes (proposals)
    proposals_x1 = proposals[:, 0].unsqueeze(1)
    proposals_y1 = proposals[:, 1].unsqueeze(1)
    proposals_x2 = proposals[:, 2].unsqueeze(1)
    proposals_y2 = proposals[:, 3].unsqueeze(1)

    # xmin, ymin, xmax, ymax of the matched ground-truth (reference) boxes
    reference_boxes_x1 = reference_boxes[:, 0].unsqueeze(1)
    reference_boxes_y1 = reference_boxes[:, 1].unsqueeze(1)
    reference_boxes_x2 = reference_boxes[:, 2].unsqueeze(1)
    reference_boxes_y2 = reference_boxes[:, 3].unsqueeze(1)

    # width and height of the anchor boxes
    ex_widths = proposals_x2 - proposals_x1
    ex_heights = proposals_y2 - proposals_y1
    # center x, y of the anchor boxes
    ex_ctr_x = proposals_x1 + 0.5 * ex_widths
    ex_ctr_y = proposals_y1 + 0.5 * ex_heights

    # width and height of the ground-truth boxes
    gt_widths = reference_boxes_x2 - reference_boxes_x1
    gt_heights = reference_boxes_y2 - reference_boxes_y1
    # center x, y of the ground-truth boxes
    gt_ctr_x = reference_boxes_x1 + 0.5 * gt_widths
    gt_ctr_y = reference_boxes_y1 + 0.5 * gt_heights

    # normalize relative to the anchors so the targets are in the same scale as the feature-map regression output; this encoding is the exact inverse of the decoding in BoxCoder.decode_single()
    targets_dx = wx * (gt_ctr_x - ex_ctr_x) / ex_widths
    targets_dy = wy * (gt_ctr_y - ex_ctr_y) / ex_heights
    targets_dw = ww * torch.log(gt_widths / ex_widths)
    targets_dh = wh * torch.log(gt_heights / ex_heights)

    # concatenate targets_dx, targets_dy, targets_dw, targets_dh
    targets = torch.cat((targets_dx, targets_dy, targets_dw, targets_dh), dim=1)
    return targets
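
Since encode_boxes is the exact inverse of decode_single, a quick round-trip sketch (mine, with made-up boxes) shows that decoding the encoded targets against the same anchor recovers the ground-truth box:

import torch
from torchvision.models.detection._utils import BoxCoder

coder = BoxCoder(weights=(1.0, 1.0, 1.0, 1.0))
anchors = [torch.tensor([[10.0, 10.0, 50.0, 50.0]])]
gt_boxes = [torch.tensor([[12.0, 8.0, 48.0, 60.0]])]

deltas = coder.encode(gt_boxes, anchors)[0]      # (dx, dy, dw, dh) regression targets
decoded = coder.decode(deltas, anchors)          # apply the deltas back to the anchor
print(decoded.reshape(-1, 4))                    # tensor([[12.,  8., 48., 60.]])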

The TwoMLPHead converts the 7x7 pooled input into a 1024-dimensional feature vector.

class TwoMLPHead(nn.Module):
    """
    Standard heads for FPN-based models

    Args:
        in_channels (int): number of input channels
        representation_size (int): size of the intermediate representation
    """

    def __init__(self, in_channels, representation_size):
        super().__init__()

        self.fc6 = nn.Linear(in_channels, representation_size)
        self.fc7 = nn.Linear(representation_size, representation_size)

    def forward(self, x):
        x = x.flatten(start_dim=1)

        x = F.relu(self.fc6(x))
        x = F.relu(self.fc7(x))

        return x
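
Usage sketch (mine, with made-up shapes, using the class above): RoI-pooled features of shape [num_rois, 256, 7, 7] are flattened and mapped to a 1024-dimensional vector per RoI.

import torch

box_head = TwoMLPHead(in_channels=256 * 7 * 7, representation_size=1024)
roi_features = torch.rand(512, 256, 7, 7)
print(box_head(roi_features).shape)  # torch.Size([512, 1024])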

Faster RCNN's classification layer and bounding-box regression layer.

class FastRCNNPredictor(nn.Module):
    """
    Standard classification + bounding box regression layers
    for Fast R-CNN.

    Args:
        in_channels (int): number of input channels
        num_classes (int): number of output classes (including background)
    """

    def __init__(self, in_channels, num_classes):
        super().__init__()
        self.cls_score = nn.Linear(in_channels, num_classes)
        self.bbox_pred = nn.Linear(in_channels, num_classes * 4)

    def forward(self, x):
        if x.dim() == 4:
            assert list(x.shape[2:]) == [1, 1]
        x = x.flatten(start_dim=1)
        scores = self.cls_score(x)
        bbox_deltas = self.bbox_pred(x)

        return scores, bbox_deltas
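
And the corresponding sketch for the predictor (mine, using the class above): each 1024-dimensional RoI vector yields num_classes scores and num_classes * 4 box deltas.

import torch

predictor = FastRCNNPredictor(in_channels=1024, num_classes=3)
scores, bbox_deltas = predictor(torch.rand(512, 1024))
print(scores.shape, bbox_deltas.shape)  # torch.Size([512, 3]) torch.Size([512, 12])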

class RoIHeads(nn.Module):
    __annotations__ = {
        "box_coder": det_utils.BoxCoder,
        "proposal_matcher": det_utils.Matcher,
        "fg_bg_sampler": det_utils.BalancedPositiveNegativeSampler,
    }

    def __init__(
        self,
        box_roi_pool,
        box_head,
        box_predictor,
        # Faster R-CNN training
        fg_iou_thresh,
        bg_iou_thresh,
        batch_size_per_image,
        positive_fraction,
        bbox_reg_weights,
        # Faster R-CNN inference
        score_thresh,
        nms_thresh,
        detections_per_img,
        # Mask
        mask_roi_pool=None,
        mask_head=None,
        mask_predictor=None,
        keypoint_roi_pool=None,
        keypoint_head=None,
        keypoint_predictor=None,
    ):
        super().__init__()

        self.box_similarity = box_ops.box_iou
        # assign ground-truth boxes for each proposal
        self.proposal_matcher = det_utils.Matcher(fg_iou_thresh, bg_iou_thresh, allow_low_quality_matches=False)

        self.fg_bg_sampler = det_utils.BalancedPositiveNegativeSampler(batch_size_per_image, positive_fraction)

        if bbox_reg_weights is None:
            bbox_reg_weights = (10.0, 10.0, 5.0, 5.0)
        self.box_coder = det_utils.BoxCoder(bbox_reg_weights)

        self.box_roi_pool = box_roi_pool
        self.box_head = box_head
        self.box_predictor = box_predictor

        self.score_thresh = score_thresh
        self.nms_thresh = nms_thresh
        self.detections_per_img = detections_per_img

        self.mask_roi_pool = mask_roi_pool
        self.mask_head = mask_head
        self.mask_predictor = mask_predictor

        self.keypoint_roi_pool = keypoint_roi_pool
        self.keypoint_head = keypoint_head
        self.keypoint_predictor = keypoint_predictor

    def has_mask(self):
        if self.mask_roi_pool is None:
            return False
        if self.mask_head is None:
            return False
        if self.mask_predictor is None:
            return False
        return True

    def has_keypoint(self):
        if self.keypoint_roi_pool is None:
            return False
        if self.keypoint_head is None:
            return False
        if self.keypoint_predictor is None:
            return False
        return True

    def assign_targets_to_proposals(self, proposals, gt_boxes, gt_labels):
        # type: (List[Tensor], List[Tensor], List[Tensor]) -> Tuple[List[Tensor], List[Tensor]]
        matched_idxs = []
        labels = []
        for proposals_in_image, gt_boxes_in_image, gt_labels_in_image in zip(proposals, gt_boxes, gt_labels):

            if gt_boxes_in_image.numel() == 0:
                # Background image
                device = proposals_in_image.device
                clamped_matched_idxs_in_image = torch.zeros(
                    (proposals_in_image.shape[0],), dtype=torch.int64, device=device
                )
                labels_in_image = torch.zeros((proposals_in_image.shape[0],), dtype=torch.int64, device=device)
            else:
                #  set to self.box_similarity when https://github.com/pytorch/pytorch/issues/27495 lands
                match_quality_matrix = box_ops.box_iou(gt_boxes_in_image, proposals_in_image)   # compute the IoU between every proposal and every gt box
                matched_idxs_in_image = self.proposal_matcher(match_quality_matrix)     # flag proposals whose IoU is below high_iou_threshold (-1 for background, -2 for in-between)

                clamped_matched_idxs_in_image = matched_idxs_in_image.clamp(min=0)      # note: gt indices of -1 and -2 are clamped to 0 here

                labels_in_image = gt_labels_in_image[clamped_matched_idxs_in_image]  # gt label of the gt box each proposal is matched to
                labels_in_image = labels_in_image.to(dtype=torch.int64)

                # proposals with gt index -1 get class 0, i.e. background / negative samples
                bg_inds = matched_idxs_in_image == self.proposal_matcher.BELOW_LOW_THRESHOLD
                labels_in_image[bg_inds] = 0

                # proposals with gt index -2 get label -1, i.e. discarded samples
                ignore_inds = matched_idxs_in_image == self.proposal_matcher.BETWEEN_THRESHOLDS
                labels_in_image[ignore_inds] = -1  # -1 is ignored by sampler

            matched_idxs.append(clamped_matched_idxs_in_image)
            labels.append(labels_in_image)
        return matched_idxs, labels

    def subsample(self, labels):
        # type: (List[Tensor]) -> List[Tensor]
        sampled_pos_inds, sampled_neg_inds = self.fg_bg_sampler(labels)
        sampled_inds = []
        for img_idx, (pos_inds_img, neg_inds_img) in enumerate(zip(sampled_pos_inds, sampled_neg_inds)):
            img_sampled_inds = torch.where(pos_inds_img | neg_inds_img)[0]
            sampled_inds.append(img_sampled_inds)
        return sampled_inds

    def add_gt_proposals(self, proposals, gt_boxes):
        # type: (List[Tensor], List[Tensor]) -> List[Tensor]
        proposals = [torch.cat((proposal, gt_box)) for proposal, gt_box in zip(proposals, gt_boxes)]

        return proposals

    def check_targets(self, targets):
        # type: (Optional[List[Dict[str, Tensor]]]) -> None
        assert targets is not None
        assert all(["boxes" in t for t in targets])
        assert all(["labels" in t for t in targets])
        if self.has_mask():
            assert all(["masks" in t for t in targets])

    def select_training_samples(
        self,
        proposals,  # type: List[Tensor]
        targets,  # type: Optional[List[Dict[str, Tensor]]]
    ):
        # type: (...) -> Tuple[List[Tensor], List[Tensor], List[Tensor], List[Tensor]]
        self.check_targets(targets)
        assert targets is not None
        dtype = proposals[0].dtype
        device = proposals[0].device

        gt_boxes = [t["boxes"].to(dtype) for t in targets]
        gt_labels = [t["labels"] for t in targets]

        # append the gt boxes to the proposals
        proposals = self.add_gt_proposals(proposals, gt_boxes)

        # match each proposal to a gt box and split the proposals into positive and negative samples
        matched_idxs, labels = self.assign_targets_to_proposals(proposals, gt_boxes, gt_labels)
        # sample positives and negatives with the configured count and ratio
        sampled_inds = self.subsample(labels)
        matched_gt_boxes = []
        num_images = len(proposals)
        for img_id in range(num_images):
            # indices of the sampled positives/negatives in this image
            img_sampled_inds = sampled_inds[img_id]
            # proposals of the sampled positives/negatives
            proposals[img_id] = proposals[img_id][img_sampled_inds]
            # ground-truth class labels of the sampled proposals
            labels[img_id] = labels[img_id][img_sampled_inds]
            # gt indices of the sampled proposals
            matched_idxs[img_id] = matched_idxs[img_id][img_sampled_inds]

            gt_boxes_in_image = gt_boxes[img_id]
            if gt_boxes_in_image.numel() == 0:
                gt_boxes_in_image = torch.zeros((1, 4), dtype=dtype, device=device)
            # gt boxes matched to the sampled proposals
            matched_gt_boxes.append(gt_boxes_in_image[matched_idxs[img_id]])
        # compute the box regression targets of the proposals with respect to their matched gt boxes
        regression_targets = self.box_coder.encode(matched_gt_boxes, proposals)
        return proposals, matched_idxs, labels, regression_targets

    def postprocess_detections(
        self,
        class_logits,  # type: Tensor
        box_regression,  # type: Tensor
        proposals,  # type: List[Tensor]
        image_shapes,  # type: List[Tuple[int, int]]
    ):
        # type: (...) -> Tuple[List[Tensor], List[Tensor], List[Tensor]]
        device = class_logits.device
        num_classes = class_logits.shape[-1]

        boxes_per_image = [boxes_in_image.shape[0] for boxes_in_image in proposals]
        pred_boxes = self.box_coder.decode(box_regression, proposals)

        pred_scores = F.softmax(class_logits, -1)

        pred_boxes_list = pred_boxes.split(boxes_per_image, 0)
        pred_scores_list = pred_scores.split(boxes_per_image, 0)

        all_boxes = []
        all_scores = []
        all_labels = []
        for boxes, scores, image_shape in zip(pred_boxes_list, pred_scores_list, image_shapes):
            boxes = box_ops.clip_boxes_to_image(boxes, image_shape)

            # create labels for each prediction
            labels = torch.arange(num_classes, device=device)
            labels = labels.view(1, -1).expand_as(scores)

            # remove predictions with the background label
            boxes = boxes[:, 1:]
            scores = scores[:, 1:]
            labels = labels[:, 1:]

            # batch everything, by making every class prediction be a separate instance
            boxes = boxes.reshape(-1, 4)
            scores = scores.reshape(-1)
            labels = labels.reshape(-1)

            # remove low scoring boxes
            inds = torch.where(scores > self.score_thresh)[0]
            boxes, scores, labels = boxes[inds], scores[inds], labels[inds]

            # remove empty boxes
            keep = box_ops.remove_small_boxes(boxes, min_size=1e-2)
            boxes, scores, labels = boxes[keep], scores[keep], labels[keep]

            # non-maximum suppression, independently done per class
            keep = box_ops.batched_nms(boxes, scores, labels, self.nms_thresh)
            # keep only topk scoring predictions
            keep = keep[: self.detections_per_img]
            boxes, scores, labels = boxes[keep], scores[keep], labels[keep]

            all_boxes.append(boxes)
            all_scores.append(scores)
            all_labels.append(labels)

        return all_boxes, all_scores, all_labels

    def forward(
        self,
        features,  # type: Dict[str, Tensor]
        proposals,  # type: List[Tensor]
        image_shapes,  # type: List[Tuple[int, int]]
        targets=None,  # type: Optional[List[Dict[str, Tensor]]]
    ):
        # type: (...) -> Tuple[List[Dict[str, Tensor]], Dict[str, Tensor]]
        """
        Args:
            features (List[Tensor])
            proposals (List[Tensor[N, 4]])
            image_shapes (List[Tuple[H, W]])
            targets (List[Dict])
        """
        if targets is not None:
            for t in targets:
                # TODO: https://github.com/pytorch/pytorch/issues/26731
                floating_point_types = (torch.float, torch.double, torch.half)
                assert t["boxes"].dtype in floating_point_types, "target boxes must of float type"
                assert t["labels"].dtype == torch.int64, "target labels must of int64 type"
                if self.has_keypoint():
                    assert t["keypoints"].dtype == torch.float32, "target keypoints must of float type"

        if self.training:
            # sample positive/negative proposals and compute their regression targets
            proposals, matched_idxs, labels, regression_targets = self.select_training_samples(proposals, targets)
        else:
            labels = None
            regression_targets = None
            matched_idxs = None
        # multi-scale RoI Align pools the features of every proposal
        box_features = self.box_roi_pool(features, proposals, image_shapes)
        # two fully-connected layers turn the pooled features, e.g. [1024, 256, 7, 7], into [1024, 1024]
        box_features = self.box_head(box_features)
        # predict the box regression and class scores for every proposal
        class_logits, box_regression = self.box_predictor(box_features)

        result: List[Dict[str, torch.Tensor]] = []
        losses = {}
        if self.training:
            assert labels is not None and regression_targets is not None
            # compute the final classification and box regression losses
            loss_classifier, loss_box_reg = fastrcnn_loss(class_logits, box_regression, labels, regression_targets)
            losses = {"loss_classifier": loss_classifier, "loss_box_reg": loss_box_reg}
        else:
            # post-process the predictions to get the final boxes, scores and labels
            boxes, scores, labels = self.postprocess_detections(class_logits, box_regression, proposals, image_shapes)
            num_images = len(boxes)
            for i in range(num_images):
                result.append(
                    {
                        "boxes": boxes[i],
                        "labels": labels[i],
                        "scores": scores[i],
                    }
                )

        if self.has_mask():
            mask_proposals = [p["boxes"] for p in result]
            if self.training:
                assert matched_idxs is not None
                # during training, only focus on positive boxes
                num_images = len(proposals)
                mask_proposals = []
                pos_matched_idxs = []
                for img_id in range(num_images):
                    pos = torch.where(labels[img_id] > 0)[0]
                    mask_proposals.append(proposals[img_id][pos])
                    pos_matched_idxs.append(matched_idxs[img_id][pos])
            else:
                pos_matched_idxs = None

            if self.mask_roi_pool is not None:
                mask_features = self.mask_roi_pool(features, mask_proposals, image_shapes)
                mask_features = self.mask_head(mask_features)
                mask_logits = self.mask_predictor(mask_features)
            else:
                raise Exception("Expected mask_roi_pool to be not None")

            loss_mask = {}
            if self.training:
                assert targets is not None
                assert pos_matched_idxs is not None
                assert mask_logits is not None

                gt_masks = [t["masks"] for t in targets]
                gt_labels = [t["labels"] for t in targets]
                rcnn_loss_mask = maskrcnn_loss(mask_logits, mask_proposals, gt_masks, gt_labels, pos_matched_idxs)
                loss_mask = {"loss_mask": rcnn_loss_mask}
            else:
                labels = [r["labels"] for r in result]
                masks_probs = maskrcnn_inference(mask_logits, labels)
                for mask_prob, r in zip(masks_probs, result):
                    r["masks"] = mask_prob

            losses.update(loss_mask)

        # keep none checks in if conditional so torchscript will conditionally
        # compile each branch
        if (
            self.keypoint_roi_pool is not None
            and self.keypoint_head is not None
            and self.keypoint_predictor is not None
        ):
            keypoint_proposals = [p["boxes"] for p in result]
            if self.training:
                # during training, only focus on positive boxes
                num_images = len(proposals)
                keypoint_proposals = []
                pos_matched_idxs = []
                assert matched_idxs is not None
                for img_id in range(num_images):
                    pos = torch.where(labels[img_id] > 0)[0]
                    keypoint_proposals.append(proposals[img_id][pos])
                    pos_matched_idxs.append(matched_idxs[img_id][pos])
            else:
                pos_matched_idxs = None

            keypoint_features = self.keypoint_roi_pool(features, keypoint_proposals, image_shapes)
            keypoint_features = self.keypoint_head(keypoint_features)
            keypoint_logits = self.keypoint_predictor(keypoint_features)

            loss_keypoint = {}
            if self.training:
                assert targets is not None
                assert pos_matched_idxs is not None

                gt_keypoints = [t["keypoints"] for t in targets]
                rcnn_loss_keypoint = keypointrcnn_loss(
                    keypoint_logits, keypoint_proposals, gt_keypoints, pos_matched_idxs
                )
                loss_keypoint = {"loss_keypoint": rcnn_loss_keypoint}
            else:
                assert keypoint_logits is not None
                assert keypoint_proposals is not None

                keypoints_probs, kp_scores = keypointrcnn_inference(keypoint_logits, keypoint_proposals)
                for keypoint_prob, kps, r in zip(keypoints_probs, kp_scores, result):
                    r["keypoints"] = keypoint_prob
                    r["keypoints_scores"] = kps

            losses.update(loss_keypoint)

        return result, losses
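
The box_roi_pool passed into RoIHeads is the multi-scale RoI Align mentioned in forward(); a sketch of how it is typically constructed (these are the torchvision FasterRCNN defaults): every proposal is pooled into a 256 x 7 x 7 feature from the appropriate FPN level.

from torchvision.ops import MultiScaleRoIAlign

box_roi_pool = MultiScaleRoIAlign(
    featmap_names=["0", "1", "2", "3"],  # which FPN levels to pool from
    output_size=7,                       # each RoI becomes a 7x7 feature map
    sampling_ratio=2,
)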