Faster RCNN
前几篇写了yolo系列的模型训练以及模型优化,加速,这篇是关于Faster RCNN。Faster RCNN所用的数据集是来自yolo v3,基本可以无缝使用。数据集参考:https://blog.csdn.net/kui9702/article/details/122954209
本节代码 GitHub:https://github.com/kile97/faster-rcnn-pytorch 或者 https://gitee.com/kui9702_kile/faster-rcnn-pytorch-master
Faster RCNN的网络结构主要分为3部分:
backbone 骨干网络
模型的主体网络,提取特征
RegionProposalNetwork 区域建议网络
将backbone提取的特征转化为特定的分类信息与坐标信息,并将特定的信息与原图进行映射,利用nms保留所需的建议框输出给RoIHead
RoIHead 兴趣区域网络
对建议框进行RoI Pooling,获得最终的分类信息和坐标(位置)信息
Faster RCNN 整体结构
# Configure anchor_sizes. The values come from the k-means clustering the author ran earlier for YOLO v3.
# NOTE(review): the original note says this yields len(anchor_sizes) * len(aspect_ratios) anchor boxes;
# with the nested tuples below that product is 1, so the per-location count is presumably
# len(anchor_sizes[0]) * len(aspect_ratios[0]) — confirm against AnchorGenerator.
# NOTE(review): a single inner tuple describes one feature level; confirm this matches the
# number of feature maps produced by the backbone that is passed to FasterRCNN.
anchor_sizes = ((25,35,50,90,120),)
# anchor_sizes = ((32,), (64,), (128,), (256,), (512,))
aspect_ratios = ((0.5, 1.0, 2.0),)
# Anchor-box generator used by the RPN
rpn_anchor_generator = AnchorGenerator(anchor_sizes, aspect_ratios)
# Build the Faster R-CNN model
model = FasterRCNN(backbone, num_classes, rpn_anchor_generator=rpn_anchor_generator)
Backbone
backbone 选用Resnet50,这里使用_resnet_fpn_extractor函数对Resnet50进行公用特征层的分离,并从这部分网络获取后续FPN所需的多层特征图(feature map)。
# Build the ResNet-50 backbone; the weights can come from torchvision's pretrained ResNet-50 or from a pretrained Faster R-CNN
backbone = resnet50(pretrained=pretrained_backbone, progress=True, norm_layer=FrozenBatchNorm2d)
# Use _resnet_fpn_extractor to turn it into the Faster R-CNN backbone; per the author, the feature layers that are kept are backbone layers [1, 2, 3, 4]
backbone = _resnet_fpn_extractor(backbone, trainable_backbone_layers)
class BackboneWithFPN(nn.Module):
    """
    Feature extractor that chains a backbone with a Feature Pyramid Network.

    ``IntermediateLayerGetter`` pulls the activations named in ``return_layers``
    out of ``backbone``; the FPN is then applied on top of them.

    Args:
        backbone (nn.Module): network whose intermediate layers are tapped
        return_layers (Dict[str, str]): maps the name of each child module to
            return to the key used for it in the output dict
        in_channels_list (List[int]): number of channels of each returned
            feature map, forwarded to the FPN
        out_channels (int): number of channels of every FPN output
        extra_blocks (ExtraFPNBlock or None): extra block forwarded to the FPN;
            a ``LastLevelMaxPool`` is used when omitted

    Attributes:
        out_channels (int): number of channels of every FPN output
    """

    def __init__(
        self,
        backbone: nn.Module,
        return_layers: Dict[str, str],
        in_channels_list: List[int],
        out_channels: int,
        extra_blocks: Optional[ExtraFPNBlock] = None,
    ) -> None:
        super().__init__()

        if extra_blocks is None:
            extra_blocks = LastLevelMaxPool()

        self.body = IntermediateLayerGetter(backbone, return_layers=return_layers)
        self.fpn = FeaturePyramidNetwork(
            in_channels_list=in_channels_list,
            out_channels=out_channels,
            extra_blocks=extra_blocks,
        )
        self.out_channels = out_channels

    def forward(self, x: Tensor) -> Dict[str, Tensor]:
        """Return the FPN feature maps computed from the input tensor ``x``."""
        # One feature map per entry of return_layers (four in the article's ResNet-50 setup)
        x = self.body(x)
        # Fuse the feature maps top-down through the feature pyramid
        x = self.fpn(x)
        return x
class IntermediateLayerGetter(nn.ModuleDict):
    """
    Module wrapper that returns intermediate layers from a model.

    Only the direct children of ``model`` can be returned, and they are run in
    the order in which they were registered. Children registered after the
    last requested layer are not kept.

    NOTE(review): the article's copy of this class lost everything between the
    first docstring line and the tail of ``orig_return_layers = return_layers``;
    the missing part was restored from the torchvision implementation.

    Args:
        model (nn.Module): model whose intermediate activations are extracted
        return_layers (Dict[str, str]): maps the name of each child module to
            return to the key used for its activation in the output dict
    """

    _version = 2
    __annotations__ = {
        "return_layers": Dict[str, str],
    }

    def __init__(self, model: nn.Module, return_layers: Dict[str, str]) -> None:
        if not set(return_layers).issubset([name for name, _ in model.named_children()]):
            raise ValueError("return_layers are not present in model")
        # Keep the caller's mapping; the copy below is consumed while scanning
        orig_return_layers = return_layers
        return_layers = {str(k): str(v) for k, v in return_layers.items()}
        layers = OrderedDict()
        for name, module in model.named_children():
            layers[name] = module
            if name in return_layers:
                del return_layers[name]
            # Stop once every requested layer has been collected
            if not return_layers:
                break

        super().__init__(layers)
        self.return_layers = orig_return_layers

    def forward(self, x):
        """Run the kept children in order and collect the requested outputs."""
        # The output is an ordered dict keyed by the names given in return_layers
        out = OrderedDict()
        # Walk the kept children (conv1, layer1, layer2, ... for a ResNet); in the
        # article return_layers is
        # {'layer1': '0', 'layer2': '1', 'layer3': '2', 'layer4': '3'}
        for name, module in self.items():
            x = module(x)
            if name in self.return_layers:
                out_name = self.return_layers[name]
                out[out_name] = x
        return out
class FeaturePyramidNetwork(nn.Module):
    """
    Module that adds a FPN from on top of a set of feature maps. This is based on
    `"Feature Pyramid Network for Object Detection" <https://arxiv.org/abs/1612.03144>`_.

    The feature maps are currently supposed to be in increasing depth
    order.

    The input to the model is expected to be an OrderedDict[Tensor], containing
    the feature maps on top of which the FPN will be added.

    Args:
        in_channels_list (list[int]): number of channels for each feature map that
            is passed to the module
        out_channels (int): number of channels of the FPN representation
        extra_blocks (ExtraFPNBlock or None): if provided, extra operations will
            be performed. It is expected to take the fpn features, the original
            features and the names of the original features as input, and returns
            a new list of feature maps and their corresponding names

    Examples::

        >>> m = torchvision.ops.FeaturePyramidNetwork([10, 20, 30], 5)
        >>> # get some dummy data
        >>> x = OrderedDict()
        >>> x['feat0'] = torch.rand(1, 10, 64, 64)
        >>> x['feat2'] = torch.rand(1, 20, 16, 16)
        >>> x['feat3'] = torch.rand(1, 30, 8, 8)
        >>> # compute the FPN on top of x
        >>> output = m(x)
        >>> print([(k, v.shape) for k, v in output.items()])
        >>> # returns
        >>>   [('feat0', torch.Size([1, 5, 64, 64])),
        >>>    ('feat2', torch.Size([1, 5, 16, 16])),
        >>>    ('feat3', torch.Size([1, 5, 8, 8]))]

    """

    def __init__(
        self,
        in_channels_list: List[int],
        out_channels: int,
        extra_blocks: Optional[ExtraFPNBlock] = None,
    ):
        super().__init__()
        _log_api_usage_once(self)
        # inner_blocks: 1x1 lateral convs; layer_blocks: 3x3 output convs
        self.inner_blocks = nn.ModuleList()
        self.layer_blocks = nn.ModuleList()
        for in_channels in in_channels_list:
            if in_channels == 0:
                raise ValueError("in_channels=0 is currently not supported")
            inner_block_module = nn.Conv2d(in_channels, out_channels, 1)
            layer_block_module = nn.Conv2d(out_channels, out_channels, 3, padding=1)
            self.inner_blocks.append(inner_block_module)
            self.layer_blocks.append(layer_block_module)

        # initialize parameters now to avoid modifying the initialization of top_blocks
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_uniform_(m.weight, a=1)
                nn.init.constant_(m.bias, 0)

        if extra_blocks is not None:
            assert isinstance(extra_blocks, ExtraFPNBlock)
        self.extra_blocks = extra_blocks

    def get_result_from_inner_blocks(self, x: Tensor, idx: int) -> Tensor:
        """
        This is equivalent to self.inner_blocks[idx](x),
        but torchscript doesn't support this yet
        """
        num_blocks = len(self.inner_blocks)
        if idx < 0:
            idx += num_blocks
        out = x
        for i, module in enumerate(self.inner_blocks):
            if i == idx:
                out = module(x)
        return out

    def get_result_from_layer_blocks(self, x: Tensor, idx: int) -> Tensor:
        """
        This is equivalent to self.layer_blocks[idx](x),
        but torchscript doesn't support this yet
        """
        num_blocks = len(self.layer_blocks)
        if idx < 0:
            idx += num_blocks
        out = x
        for i, module in enumerate(self.layer_blocks):
            if i == idx:
                out = module(x)
        return out

    def forward(self, x: Dict[str, Tensor]) -> Dict[str, Tensor]:
        """
        Computes the FPN for a set of feature maps.

        Args:
            x (OrderedDict[Tensor]): feature maps for each feature level.

        Returns:
            results (OrderedDict[Tensor]): feature maps after FPN layers.
                They are ordered from highest resolution first.
        """
        # Names of the input feature levels (four in the article's setup)
        names = list(x.keys())
        # The feature maps themselves, in the same order
        x = list(x.values())

        # Start from the last (deepest) feature map;
        # equivalent to last_inner = self.inner_blocks[-1](x[-1])
        last_inner = self.get_result_from_inner_blocks(x[-1], -1)
        results = []
        # Equivalent to results.append(self.layer_blocks[-1](last_inner))
        results.append(self.get_result_from_layer_blocks(last_inner, -1))

        # Walk the remaining levels from deep to shallow: upsample the running
        # top-down map and add it to the lateral map of the current level
        for idx in range(len(x) - 2, -1, -1):
            inner_lateral = self.get_result_from_inner_blocks(x[idx], idx)
            feat_shape = inner_lateral.shape[-2:]
            # Nearest-neighbour upsampling of last_inner to the lateral map's size
            inner_top_down = F.interpolate(last_inner, size=feat_shape, mode="nearest")
            # Fuse lateral and top-down maps by element-wise addition
            last_inner = inner_lateral + inner_top_down
            # Prepend so that results stay ordered from highest resolution first
            results.insert(0, self.get_result_from_layer_blocks(last_inner, idx))

        # extra_blocks is supplied by the caller; BackboneWithFPN passes a
        # LastLevelMaxPool when none is given, which adds one more level
        if self.extra_blocks is not None:
            results, names = self.extra_blocks(results, x, names)

        # make it back an OrderedDict
        out = OrderedDict([(k, v) for k, v in zip(names, results)])
        # Shapes reported by the author for their example batch:
        # {'0':[2,256,200,240],'1':[2,256,100,120],'2':[2,256,50,60],'3':[2,256,25,30],'pool':[2,256,13,15]}
        return out
Faster RCNN继承自GeneralizedRCNN类,前向计算使用的是GeneralizedRCNN中实现的forward函数:
class GeneralizedRCNN(nn.Module):
    """
    Generalized R-CNN pipeline: transform -> backbone -> RPN -> RoI heads.

    Args:
        backbone (nn.Module): computes the feature maps from the image batch
        rpn (nn.Module): produces proposals (and RPN losses) from the features
        roi_heads (nn.Module): takes the features and the proposals and
            computes detections (and detector losses) from them
        transform (nn.Module): performs the data transformation from the inputs
            to feed into the model, and post-processes the detections
    """

    def __init__(self, backbone: nn.Module, rpn: nn.Module, roi_heads: nn.Module, transform: nn.Module) -> None:
        super().__init__()
        # Record API usage of torchvision / torch
        _log_api_usage_once(self)
        # Keep the sub-modules as attributes
        self.transform = transform
        self.backbone = backbone
        self.rpn = rpn
        self.roi_heads = roi_heads
        # used only on torchscript mode
        self._has_warned = False

    @torch.jit.unused
    def eager_outputs(self, losses, detections):
        # type: (Dict[str, Tensor], List[Dict[str, Tensor]]) -> Union[Dict[str, Tensor], List[Dict[str, Tensor]]]
        """Return the losses in training mode and the detections otherwise."""
        if self.training:
            return losses

        return detections

    def forward(self, images, targets=None):
        # type: (List[Tensor], Optional[List[Dict[str, Tensor]]]) -> Tuple[Dict[str, Tensor], List[Dict[str, Tensor]]]
        """
        Args:
            images (list[Tensor]): images to be processed
            targets (list[Dict[str, Tensor]]): ground-truth boxes present in the image (optional)

        Returns:
            result (list[BoxList] or dict[Tensor]): the output from the model.
                During training, it returns a dict[Tensor] which contains the losses.
                During testing, it returns list[BoxList] contains additional fields
                like `scores`, `labels` and `mask` (for Mask R-CNN models).

        Raises:
            ValueError: in training mode when ``targets`` is missing or a
                ``boxes`` entry is not a tensor of shape [N, 4]; in any mode
                when a target box has non-positive width or height.
        """
        if self.training and targets is None:
            raise ValueError("In training mode, targets should be passed")
        if self.training:
            assert targets is not None
            for target in targets:
                # Ground-truth boxes of this image
                boxes = target["boxes"]
                if isinstance(boxes, torch.Tensor):
                    if len(boxes.shape) != 2 or boxes.shape[-1] != 4:
                        raise ValueError(f"Expected target boxes to be a tensor of shape [N, 4], got {boxes.shape}.")
                else:
                    raise ValueError(f"Expected target boxes to be of type Tensor, got {type(boxes)}.")

        original_image_sizes: List[Tuple[int, int]] = []
        # Remember the size of every image before it is transformed
        for img in images:
            # The last two dimensions of img.shape are taken as (height, width)
            val = img.shape[-2:]
            assert len(val) == 2
            original_image_sizes.append((val[0], val[1]))

        # Pre-process the batch; the result exposes .tensors and .image_sizes
        images, targets = self.transform(images, targets)

        # Check for degenerate boxes
        # TODO: Move this to a function
        if targets is not None:
            for target_idx, target in enumerate(targets):
                # A box is degenerate when its max corner is not strictly
                # greater than its min corner in either coordinate
                boxes = target["boxes"]
                degenerate_boxes = boxes[:, 2:] <= boxes[:, :2]
                if degenerate_boxes.any():
                    # print the first degenerate box
                    bb_idx = torch.where(degenerate_boxes.any(dim=1))[0][0]
                    degen_bb: List[float] = boxes[bb_idx].tolist()
                    raise ValueError(
                        "All bounding boxes should have positive height and width."
                        f" Found invalid box {degen_bb} for target at index {target_idx}."
                    )

        # Extract the image features
        features = self.backbone(images.tensors)
        if isinstance(features, torch.Tensor):
            # A single feature map is wrapped so later stages always see a dict
            features = OrderedDict([("0", features)])
        # Feed images, features and targets to the region proposal network; it
        # returns the proposals and the RPN losses
        proposals, proposal_losses = self.rpn(images, features, targets)
        # Feed features, proposals, transformed image sizes and targets to the
        # RoI heads; they return the detections and the detector losses
        detections, detector_losses = self.roi_heads(features, proposals, images.image_sizes, targets)
        # Post-process the detections using the transformed and the original image sizes
        detections = self.transform.postprocess(detections, images.image_sizes, original_image_sizes)  # type: ignore[operator]

        losses = {}
        losses.update(detector_losses)
        losses.update(proposal_losses)

        if torch.jit.is_scripting():
            if not self._has_warned:
                warnings.warn("RCNN always returns a (Losses, Detections) tuple in scripting")
                self._has_warned = True
            return losses, detections
        else:
            return self.eager_outputs(losses, detections)
RegionProposalNetwork 区域建议网络
筛选出符合要求、符合条件的候选框
# 自定义rpn类
rpn = RegionProposalNetwork(
rpn_anchor_generator, # anchor-box generator (covered below)
rpn_head, # simple objectness-classification and box-regression head (covered below)
rpn_fg_iou_thresh, # IoU threshold for foreground (object) anchors
rpn_bg_iou_thresh, # IoU threshold for background anchors
rpn_batch_size_per_image, # handed to the positive/negative sampler as batch_size_per_image (anchors sampled per image, not a number of images)
rpn_positive_fraction, # fraction of positives among the sampled anchors, e.g. with rpn_batch_size_per_image=256 and rpn_positive_fraction=0.5 there are 128 positives and 128 negatives
rpn_pre_nms_top_n, # number of top-scoring proposals kept per feature level before NMS (dict with "training"/"testing" entries)
rpn_post_nms_top_n, # number of proposals kept after NMS (dict with "training"/"testing" entries)
rpn_nms_thresh, # IoU threshold used by NMS
score_thresh=rpn_score_thresh, # score threshold used to drop low-scoring proposals
)
class RegionProposalNetwork(torch.nn.Module):
    """
    Region Proposal Network (RPN).

    Args:
        anchor_generator (AnchorGenerator): module that generates the anchors
            for a set of feature maps
        head (nn.Module): module that computes the objectness and regression deltas
        fg_iou_thresh (float): IoU threshold handed to the matcher as the
            foreground threshold
        bg_iou_thresh (float): IoU threshold handed to the matcher as the
            background threshold
        batch_size_per_image (int): number of anchors sampled per image for the loss
        positive_fraction (float): proportion of positive anchors in that sample
        pre_nms_top_n (Dict[str, int]): number of proposals kept per level before
            NMS; needs the entries ``training`` and ``testing``
        post_nms_top_n (Dict[str, int]): number of proposals kept after NMS;
            needs the entries ``training`` and ``testing``
        nms_thresh (float): NMS threshold used for postprocessing the RPN proposals
        score_thresh (float): proposals scoring below it are dropped
    """

    __annotations__ = {
        "box_coder": det_utils.BoxCoder,
        "proposal_matcher": det_utils.Matcher,
        "fg_bg_sampler": det_utils.BalancedPositiveNegativeSampler,
    }

    def __init__(
        self,
        anchor_generator: AnchorGenerator,
        head: nn.Module,
        # Faster-RCNN Training
        fg_iou_thresh: float,
        bg_iou_thresh: float,
        batch_size_per_image: int,
        positive_fraction: float,
        # Faster-RCNN Inference
        pre_nms_top_n: Dict[str, int],
        post_nms_top_n: Dict[str, int],
        nms_thresh: float,
        score_thresh: float = 0.0,
    ) -> None:
        super().__init__()
        self.anchor_generator = anchor_generator
        self.head = head
        self.box_coder = det_utils.BoxCoder(weights=(1.0, 1.0, 1.0, 1.0))

        # used during training
        self.box_similarity = box_ops.box_iou

        self.proposal_matcher = det_utils.Matcher(
            fg_iou_thresh,
            bg_iou_thresh,
            allow_low_quality_matches=True,
        )

        self.fg_bg_sampler = det_utils.BalancedPositiveNegativeSampler(batch_size_per_image, positive_fraction)
        # used during testing
        self._pre_nms_top_n = pre_nms_top_n
        self._post_nms_top_n = post_nms_top_n
        self.nms_thresh = nms_thresh
        self.score_thresh = score_thresh
        self.min_size = 1e-3

    def pre_nms_top_n(self) -> int:
        """Return the pre-NMS proposal budget for the current train/eval mode."""
        if self.training:
            return self._pre_nms_top_n["training"]
        return self._pre_nms_top_n["testing"]

    def post_nms_top_n(self) -> int:
        """Return the post-NMS proposal budget for the current train/eval mode."""
        if self.training:
            return self._post_nms_top_n["training"]
        return self._post_nms_top_n["testing"]

    def assign_targets_to_anchors(
        self, anchors: List[Tensor], targets: List[Dict[str, Tensor]]
    ) -> Tuple[List[Tensor], List[Tensor]]:
        """
        Label every anchor and pick the ground-truth box it is matched to.

        Returns, per image, a float label tensor (1.0 foreground, 0.0
        background, -1.0 discarded) and the matched ground-truth boxes.
        """
        labels = []
        matched_gt_boxes = []
        for anchors_per_image, targets_per_image in zip(anchors, targets):
            gt_boxes = targets_per_image["boxes"]

            if gt_boxes.numel() == 0:
                # Background image (negative example)
                device = anchors_per_image.device
                matched_gt_boxes_per_image = torch.zeros(anchors_per_image.shape, dtype=torch.float32, device=device)
                labels_per_image = torch.zeros((anchors_per_image.shape[0],), dtype=torch.float32, device=device)
            else:
                # IoU between every ground-truth box and every anchor
                match_quality_matrix = self.box_similarity(gt_boxes, anchors_per_image)
                # Index of the matched ground-truth box per anchor; anchors that
                # are not matched get the matcher's negative marker values
                # (BELOW_LOW_THRESHOLD / BETWEEN_THRESHOLDS)
                matched_idxs = self.proposal_matcher(match_quality_matrix)
                # Clamp to 0 so the negative markers become valid indices. The
                # boxes picked for background / discarded anchors are
                # meaningless, which is fine: labels_per_image records the
                # positives, and the box loss below only uses sampled positives.
                matched_gt_boxes_per_image = gt_boxes[matched_idxs.clamp(min=0)]

                # Labels: 1.0 for positives, 0.0 for background, -1.0 for discarded
                labels_per_image = matched_idxs >= 0
                labels_per_image = labels_per_image.to(dtype=torch.float32)

                # Background anchors are trained as negatives
                bg_indices = matched_idxs == self.proposal_matcher.BELOW_LOW_THRESHOLD
                labels_per_image[bg_indices] = 0.0

                # Anchors between the two thresholds are discarded (not trained on)
                inds_to_discard = matched_idxs == self.proposal_matcher.BETWEEN_THRESHOLDS
                labels_per_image[inds_to_discard] = -1.0

            labels.append(labels_per_image)
            matched_gt_boxes.append(matched_gt_boxes_per_image)
        return labels, matched_gt_boxes

    def _get_top_n_idx(self, objectness: Tensor, num_anchors_per_level: List[int]) -> Tensor:
        """Return, per image, the indices of the top-scoring anchors of every level."""
        r = []
        offset = 0
        for ob in objectness.split(num_anchors_per_level, 1):
            if torchvision._is_tracing():
                num_anchors, pre_nms_top_n = _onnx_get_num_anchors_and_pre_nms_top_n(ob, self.pre_nms_top_n())
            else:
                num_anchors = ob.shape[1]
                pre_nms_top_n = min(self.pre_nms_top_n(), num_anchors)
            _, top_n_idx = ob.topk(pre_nms_top_n, dim=1)
            # Shift the per-level indices into the concatenated index space
            r.append(top_n_idx + offset)
            offset += num_anchors
        return torch.cat(r, dim=1)

    def filter_proposals(
        self,
        proposals: Tensor,
        objectness: Tensor,
        image_shapes: List[Tuple[int, int]],
        num_anchors_per_level: List[int],
    ) -> Tuple[List[Tensor], List[Tensor]]:
        """
        Reduce the decoded proposals to the final per-image proposal lists:
        per-level top-k, clipping, small-box and low-score removal, per-level
        NMS and a final post-NMS cut.
        """
        # Number of images in the batch
        num_images = proposals.shape[0]
        device = proposals.device
        # do not backprop through objectness
        objectness = objectness.detach()
        # One row of objectness logits per image
        objectness = objectness.reshape(num_images, -1)

        # levels records which feature level every anchor comes from
        levels = [
            torch.full((n,), idx, dtype=torch.int64, device=device) for idx, n in enumerate(num_anchors_per_level)
        ]
        levels = torch.cat(levels, 0)
        levels = levels.reshape(1, -1).expand_as(objectness)

        # select top_n boxes independently per level before applying nms
        # (at most pre_nms_top_n() anchors are kept for every level)
        top_n_idx = self._get_top_n_idx(objectness, num_anchors_per_level)

        image_range = torch.arange(num_images, device=device)
        batch_idx = image_range[:, None]

        # Gather logits, level ids and boxes of the selected anchors
        objectness = objectness[batch_idx, top_n_idx]
        levels = levels[batch_idx, top_n_idx]
        proposals = proposals[batch_idx, top_n_idx]

        # Turn the logits into probabilities
        objectness_prob = torch.sigmoid(objectness)

        final_boxes = []
        final_scores = []
        for boxes, scores, lvl, img_shape in zip(proposals, objectness_prob, levels, image_shapes):
            # Clip the boxes so that they stay inside the image
            boxes = box_ops.clip_boxes_to_image(boxes, img_shape)

            # remove small boxes
            keep = box_ops.remove_small_boxes(boxes, self.min_size)
            boxes, scores, lvl = boxes[keep], scores[keep], lvl[keep]

            # remove low scoring boxes
            # use >= for Backwards compatibility
            keep = torch.where(scores >= self.score_thresh)[0]
            boxes, scores, lvl = boxes[keep], scores[keep], lvl[keep]

            # non-maximum suppression, independently done per level
            keep = box_ops.batched_nms(boxes, scores, lvl, self.nms_thresh)

            # keep only topk scoring predictions
            keep = keep[: self.post_nms_top_n()]
            boxes, scores = boxes[keep], scores[keep]

            final_boxes.append(boxes)
            final_scores.append(scores)
        return final_boxes, final_scores

    def compute_loss(
        self, objectness: Tensor, pred_bbox_deltas: Tensor, labels: List[Tensor], regression_targets: List[Tensor]
    ) -> Tuple[Tensor, Tensor]:
        """
        Args:
            objectness (Tensor)
            pred_bbox_deltas (Tensor)
            labels (List[Tensor])
            regression_targets (List[Tensor])

        Returns:
            objectness_loss (Tensor)
            box_loss (Tensor)
        """

        # Sample positives and negatives according to the configured sampler
        sampled_pos_inds, sampled_neg_inds = self.fg_bg_sampler(labels)
        # Concatenate the per-image masks and take the indices of the positives
        sampled_pos_inds = torch.where(torch.cat(sampled_pos_inds, dim=0))[0]
        # Same for the negatives
        sampled_neg_inds = torch.where(torch.cat(sampled_neg_inds, dim=0))[0]

        # All sampled anchors, positives first
        sampled_inds = torch.cat([sampled_pos_inds, sampled_neg_inds], dim=0)

        # Flatten the objectness logits to one dimension
        objectness = objectness.flatten()

        # Concatenate the per-image labels and regression targets
        labels = torch.cat(labels, dim=0)
        regression_targets = torch.cat(regression_targets, dim=0)

        # Box regression loss: computed on the sampled positives only, but
        # normalised by the total number of sampled anchors
        box_loss = (
            F.smooth_l1_loss(
                pred_bbox_deltas[sampled_pos_inds],
                regression_targets[sampled_pos_inds],
                beta=1 / 9,
                reduction="sum",
            )
            / (sampled_inds.numel())
        )

        # Binary (object vs. background) classification loss on all sampled anchors
        objectness_loss = F.binary_cross_entropy_with_logits(objectness[sampled_inds], labels[sampled_inds])

        return objectness_loss, box_loss

    def forward(
        self,
        images: ImageList,
        features: Dict[str, Tensor],
        targets: Optional[List[Dict[str, Tensor]]] = None,
    ) -> Tuple[List[Tensor], Dict[str, Tensor]]:
        """
        Args:
            images (ImageList): images for which we want to compute the predictions
            features (Dict[str, Tensor]): features computed from the images that are
                used for computing the predictions. Each tensor in the list
                correspond to different feature levels
            targets (List[Dict[str, Tensor]]): ground-truth boxes present in the image (optional).
                If provided, each element in the dict should contain a field `boxes`,
                with the locations of the ground-truth boxes.

        Returns:
            boxes (List[Tensor]): the predicted boxes from the RPN, one Tensor per
                image.
            losses (Dict[str, Tensor]): the losses for the model during training. During
                testing, it is an empty dict.
        """
        # RPN uses all feature maps that are available
        features = list(features.values())
        # Objectness logits and box deltas for every feature level
        objectness, pred_bbox_deltas = self.head(features)
        # Anchors for every image
        anchors = self.anchor_generator(images, features)

        num_images = len(anchors)
        # Shape of the objectness map of each level (first image of the batch)
        num_anchors_per_level_shape_tensors = [o[0].shape for o in objectness]
        # Number of anchors on each level: product of the three dimensions
        num_anchors_per_level = [s[0] * s[1] * s[2] for s in num_anchors_per_level_shape_tensors]
        # Flatten the per-level predictions into one row per anchor
        objectness, pred_bbox_deltas = concat_box_prediction_layers(objectness, pred_bbox_deltas)
        # apply pred_bbox_deltas to anchors to obtain the decoded proposals
        # note that we detach the deltas because Faster R-CNN do not backprop through
        # the proposals
        proposals = self.box_coder.decode(pred_bbox_deltas.detach(), anchors)
        # One row of proposals per image
        proposals = proposals.view(num_images, -1, 4)
        boxes, scores = self.filter_proposals(proposals, objectness, images.image_sizes, num_anchors_per_level)

        losses = {}
        if self.training:
            assert targets is not None
            # Match every anchor to a ground-truth box and label it as
            # foreground, background or discarded
            labels, matched_gt_boxes = self.assign_targets_to_anchors(anchors, targets)
            # Regression targets: matched ground-truth boxes encoded relative to the anchors
            regression_targets = self.box_coder.encode(matched_gt_boxes, anchors)
            loss_objectness, loss_rpn_box_reg = self.compute_loss(
                objectness, pred_bbox_deltas, labels, regression_targets
            )
            losses = {
                "loss_objectness": loss_objectness,
                "loss_rpn_box_reg": loss_rpn_box_reg,
            }
        return boxes, losses
class RPNHead(nn.Module):
    """
    Adds a simple RPN Head with classification and regression heads

    Args:
        in_channels (int): number of channels of the input feature
        num_anchors (int): number of anchors to be predicted
    """

    def __init__(self, in_channels: int, num_anchors: int) -> None:
        super().__init__()
        self.conv = nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=1, padding=1)
        self.cls_logits = nn.Conv2d(in_channels, num_anchors, kernel_size=1, stride=1)
        self.bbox_pred = nn.Conv2d(in_channels, num_anchors * 4, kernel_size=1, stride=1)

        # All three convolutions: normal(std=0.01) weights, zero biases
        for layer in self.children():
            torch.nn.init.normal_(layer.weight, std=0.01)  # type: ignore[arg-type]
            torch.nn.init.constant_(layer.bias, 0)  # type: ignore[arg-type]

    def forward(self, x: List[Tensor]) -> Tuple[List[Tensor], List[Tensor]]:
        """Return the objectness logits and box deltas of every feature map in ``x``."""
        logits = []
        bbox_reg = []
        # Same head for every level: one logit per anchor (object vs.
        # background) and four box deltas per anchor
        for feature in x:
            t = F.relu(self.conv(feature))
            logits.append(self.cls_logits(t))
            bbox_reg.append(self.bbox_pred(t))
        return logits, bbox_reg
def concat_box_prediction_layers(box_cls: List[Tensor], box_regression: List[Tensor]) -> Tuple[Tensor, Tensor]:
    """
    Flatten the per-level RPN outputs into one row per anchor.

    Args:
        box_cls (List[Tensor]): classification maps, one [N, A*C, H, W] tensor per level
        box_regression (List[Tensor]): regression maps, one [N, A*4, H, W] tensor per level

    Returns:
        the concatenated classification tensor and the concatenated regression
        tensor (the latter reshaped to [-1, 4])
    """
    box_cls_flattened = []
    box_regression_flattened = []
    # for each feature level, permute the outputs to make them be in the
    # same format as the labels. Note that the labels are computed for
    # all feature levels concatenated, so we keep the same representation
    # for the objectness and the box_regression
    for box_cls_per_level, box_regression_per_level in zip(box_cls, box_regression):
        # Batch size, channels, height and width of this level
        N, AxC, H, W = box_cls_per_level.shape
        Ax4 = box_regression_per_level.shape[1]
        # Anchors per spatial location
        A = Ax4 // 4
        # Classification channels per anchor
        C = AxC // A
        # presumably yields [N, -1, C] with one row per anchor — see permute_and_flatten
        box_cls_per_level = permute_and_flatten(box_cls_per_level, N, A, C, H, W)
        box_cls_flattened.append(box_cls_per_level)

        # presumably yields [N, -1, 4] with one row per anchor — see permute_and_flatten
        box_regression_per_level = permute_and_flatten(box_regression_per_level, N, A, 4, H, W)
        box_regression_flattened.append(box_regression_per_level)
    # concatenate on the first dimension (representing the feature levels), to
    # take into account the way the labels were generated (with all feature maps
    # being concatenated as well)
    box_cls = torch.cat(box_cls_flattened, dim=1).flatten(0, -2)
    box_regression = torch.cat(box_regression_flattened, dim=1).reshape(-1, 4)
    return box_cls, box_regression
class BoxCoder:
    """
    This class encodes and decodes a set of bounding boxes into
    the representation used for training the regressors.
    """

    def __init__(
        self, weights: Tuple[float, float, float, float], bbox_xform_clip: float = math.log(1000.0 / 16)
    ) -> None:
        """
        Args:
            weights (4-element tuple): scale factors for (dx, dy, dw, dh)
            bbox_xform_clip (float): upper bound applied to dw and dh when decoding
        """
        self.weights = weights
        self.bbox_xform_clip = bbox_xform_clip

    def encode(self, reference_boxes: List[Tensor], proposals: List[Tensor]) -> List[Tensor]:
        """
        Encode the reference boxes of a whole batch relative to the proposals.

        Args:
            reference_boxes (List[Tensor]): reference boxes, one tensor per image
            proposals (List[Tensor]): boxes the encoding is relative to, one tensor per image

        Returns:
            the encoded targets, split back into one tensor per image
        """
        # Number of reference boxes of every image, used to split the result
        boxes_per_image = [len(b) for b in reference_boxes]
        # Concatenate the whole batch so it is encoded in a single call
        reference_boxes = torch.cat(reference_boxes, dim=0)
        proposals = torch.cat(proposals, dim=0)
        # Encoding puts the targets in the same parametrisation as the deltas
        # consumed by decode_single
        targets = self.encode_single(reference_boxes, proposals)
        return targets.split(boxes_per_image, 0)

    def encode_single(self, reference_boxes: Tensor, proposals: Tensor) -> Tensor:
        """
        Encode a set of proposals with respect to some
        reference boxes

        Args:
            reference_boxes (Tensor): reference boxes
            proposals (Tensor): boxes to be encoded
        """
        dtype = reference_boxes.dtype
        device = reference_boxes.device
        weights = torch.as_tensor(self.weights, dtype=dtype, device=device)
        targets = encode_boxes(reference_boxes, proposals, weights)

        return targets

    def decode(self, rel_codes: Tensor, boxes: List[Tensor]) -> Tensor:
        """
        Decode relative box offsets against the boxes of a whole batch.

        Args:
            rel_codes (Tensor): encoded offsets for all boxes of the batch
            boxes (List[Tensor]): reference boxes, one tensor per image

        Returns:
            decoded boxes, reshaped to [total number of boxes, -1, 4] when
            there is at least one box
        """
        assert isinstance(boxes, (list, tuple))
        assert isinstance(rel_codes, torch.Tensor)
        # Number of reference boxes of every image
        boxes_per_image = [b.size(0) for b in boxes]
        # Stack the boxes of the batch, e.g. two [185460, 4] tensors become [370920, 4]
        concat_boxes = torch.cat(boxes, dim=0)
        # Explicit accumulation loop kept from the original implementation
        box_sum = 0
        for val in boxes_per_image:
            box_sum += val
        if box_sum > 0:
            # One row of offsets per reference box
            rel_codes = rel_codes.reshape(box_sum, -1)
        pred_boxes = self.decode_single(rel_codes, concat_boxes)
        if box_sum > 0:
            pred_boxes = pred_boxes.reshape(box_sum, -1, 4)
        return pred_boxes

    def decode_single(self, rel_codes: Tensor, boxes: Tensor) -> Tensor:
        """
        From a set of original boxes and encoded relative box offsets,
        get the decoded boxes.

        Args:
            rel_codes (Tensor): encoded boxes
            boxes (Tensor): reference boxes.
        """

        boxes = boxes.to(rel_codes.dtype)

        # Width, height and centre of every reference box
        widths = boxes[:, 2] - boxes[:, 0]
        heights = boxes[:, 3] - boxes[:, 1]
        ctr_x = boxes[:, 0] + 0.5 * widths
        ctr_y = boxes[:, 1] + 0.5 * heights

        # Undo the scaling that was applied to the offsets when encoding
        wx, wy, ww, wh = self.weights
        # Every 4th column starting at 0/1/2/3 holds dx/dy/dw/dh
        dx = rel_codes[:, 0::4] / wx
        dy = rel_codes[:, 1::4] / wy
        dw = rel_codes[:, 2::4] / ww
        dh = rel_codes[:, 3::4] / wh

        # Prevent sending too large values into torch.exp()
        dw = torch.clamp(dw, max=self.bbox_xform_clip)
        dh = torch.clamp(dh, max=self.bbox_xform_clip)

        # Centre and size of the decoded boxes
        pred_ctr_x = dx * widths[:, None] + ctr_x[:, None]
        pred_ctr_y = dy * heights[:, None] + ctr_y[:, None]
        pred_w = torch.exp(dw) * widths[:, None]
        pred_h = torch.exp(dh) * heights[:, None]

        # Distance from center to box's corner.
        c_to_c_h = torch.tensor(0.5, dtype=pred_ctr_y.dtype, device=pred_h.device) * pred_h
        c_to_c_w = torch.tensor(0.5, dtype=pred_ctr_x.dtype, device=pred_w.device) * pred_w

        pred_boxes1 = pred_ctr_x - c_to_c_w  # xmin
        pred_boxes2 = pred_ctr_y - c_to_c_h  # ymin
        pred_boxes3 = pred_ctr_x + c_to_c_w  # xmax
        pred_boxes4 = pred_ctr_y + c_to_c_h  # ymax
        # Interleave as (xmin, ymin, xmax, ymax): top-left and bottom-right corners
        pred_boxes = torch.stack((pred_boxes1, pred_boxes2, pred_boxes3, pred_boxes4), dim=2).flatten(1)
        return pred_boxes
@torch.jit._script_if_tracing
def encode_boxes(reference_boxes: Tensor, proposals: Tensor, weights: Tensor) -> Tensor:
    """
    Encode a set of proposals with respect to some
    reference boxes

    Args:
        reference_boxes (Tensor): reference boxes
        proposals (Tensor): boxes to be encoded
        weights (Tensor[4]): the weights for ``(x, y, w, h)``
    """

    # perform some unpacking to make it JIT-fusion friendly
    wx = weights[0]
    wy = weights[1]
    ww = weights[2]
    wh = weights[3]

    # Corners (xmin, ymin, xmax, ymax) of the proposals as column vectors
    proposals_x1 = proposals[:, 0].unsqueeze(1)
    proposals_y1 = proposals[:, 1].unsqueeze(1)
    proposals_x2 = proposals[:, 2].unsqueeze(1)
    proposals_y2 = proposals[:, 3].unsqueeze(1)

    # Corners of the reference boxes as column vectors
    reference_boxes_x1 = reference_boxes[:, 0].unsqueeze(1)
    reference_boxes_y1 = reference_boxes[:, 1].unsqueeze(1)
    reference_boxes_x2 = reference_boxes[:, 2].unsqueeze(1)
    reference_boxes_y2 = reference_boxes[:, 3].unsqueeze(1)

    # implementation starts here
    # Width, height and centre of the proposals
    ex_widths = proposals_x2 - proposals_x1
    ex_heights = proposals_y2 - proposals_y1
    ex_ctr_x = proposals_x1 + 0.5 * ex_widths
    ex_ctr_y = proposals_y1 + 0.5 * ex_heights

    # Width, height and centre of the reference boxes
    gt_widths = reference_boxes_x2 - reference_boxes_x1
    gt_heights = reference_boxes_y2 - reference_boxes_y1
    gt_ctr_x = reference_boxes_x1 + 0.5 * gt_widths
    gt_ctr_y = reference_boxes_y1 + 0.5 * gt_heights

    # Encoding: centre offsets are normalised by the proposal size and sizes
    # become log-ratios; this mirrors the decoding in BoxCoder.decode_single()
    targets_dx = wx * (gt_ctr_x - ex_ctr_x) / ex_widths
    targets_dy = wy * (gt_ctr_y - ex_ctr_y) / ex_heights
    targets_dw = ww * torch.log(gt_widths / ex_widths)
    targets_dh = wh * torch.log(gt_heights / ex_heights)

    # One row (dx, dy, dw, dh) per box
    targets = torch.cat((targets_dx, targets_dy, targets_dw, targets_dh), dim=1)
    return targets
将7x7的输入转化为1024的特征向量
class TwoMLPHead(nn.Module):
    """
    Standard heads for FPN-based models

    Args:
        in_channels (int): number of input channels
        representation_size (int): size of the intermediate representation
    """

    def __init__(self, in_channels, representation_size):
        super().__init__()

        self.fc6 = nn.Linear(in_channels, representation_size)
        self.fc7 = nn.Linear(representation_size, representation_size)

    def forward(self, x):
        """Flatten everything but the first dimension and apply fc6/fc7, each followed by a ReLU."""
        x = x.flatten(start_dim=1)

        x = F.relu(self.fc6(x))
        x = F.relu(self.fc7(x))

        return x
Faster RCNN的分类层与bounding box预测回归层
class FastRCNNPredictor(nn.Module):
    """
    Standard classification + bounding box regression layers
    for Fast R-CNN.

    Args:
        in_channels (int): number of input channels
        num_classes (int): number of output classes (including background)
    """

    def __init__(self, in_channels, num_classes):
        super().__init__()
        self.cls_score = nn.Linear(in_channels, num_classes)
        self.bbox_pred = nn.Linear(in_channels, num_classes * 4)

    def forward(self, x):
        """Return the class scores and the per-class box deltas for ``x``."""
        if x.dim() == 4:
            # A 4-D input is only accepted when its spatial size is 1x1
            assert list(x.shape[2:]) == [1, 1], (
                f"x has the wrong shape, expecting the last two dimensions to be [1,1] instead of {list(x.shape[2:])}"
            )
        x = x.flatten(start_dim=1)
        scores = self.cls_score(x)
        bbox_deltas = self.bbox_pred(x)

        return scores, bbox_deltas
# RoI heads module; its constructor and helper methods follow below.
class RoIHeads(nn.Module):
# Class-level type annotations for the box coder, the proposal matcher and the
# foreground/background sampler attributes assigned in __init__.
__annotations__ = {
"box_coder": det_utils.BoxCoder,
"proposal_matcher": det_utils.Matcher,
"fg_bg_sampler": det_utils.BalancedPositiveNegativeSampler,
}
def __init__(
    self,
    box_roi_pool,
    box_head,
    box_predictor,
    # Faster R-CNN training
    fg_iou_thresh,
    bg_iou_thresh,
    batch_size_per_image,
    positive_fraction,
    bbox_reg_weights,
    # Faster R-CNN inference
    score_thresh,
    nms_thresh,
    detections_per_img,
    # Mask
    mask_roi_pool=None,
    mask_head=None,
    mask_predictor=None,
    keypoint_roi_pool=None,
    keypoint_head=None,
    keypoint_predictor=None,
):
    """
    Store the RoI-head sub-modules and build the matcher, sampler and box coder.

    ``fg_iou_thresh`` / ``bg_iou_thresh`` configure the proposal matcher,
    ``batch_size_per_image`` / ``positive_fraction`` configure the
    positive/negative sampler, and ``bbox_reg_weights`` (default
    ``(10.0, 10.0, 5.0, 5.0)`` when ``None``) configures the box coder.
    The mask and keypoint modules are optional.
    """
    super().__init__()

    self.box_similarity = box_ops.box_iou
    # assign ground-truth boxes for each proposal
    self.proposal_matcher = det_utils.Matcher(fg_iou_thresh, bg_iou_thresh, allow_low_quality_matches=False)

    self.fg_bg_sampler = det_utils.BalancedPositiveNegativeSampler(batch_size_per_image, positive_fraction)

    if bbox_reg_weights is None:
        bbox_reg_weights = (10.0, 10.0, 5.0, 5.0)
    self.box_coder = det_utils.BoxCoder(bbox_reg_weights)

    self.box_roi_pool = box_roi_pool
    self.box_head = box_head
    self.box_predictor = box_predictor

    self.score_thresh = score_thresh
    self.nms_thresh = nms_thresh
    self.detections_per_img = detections_per_img

    self.mask_roi_pool = mask_roi_pool
    self.mask_head = mask_head
    self.mask_predictor = mask_predictor

    self.keypoint_roi_pool = keypoint_roi_pool
    self.keypoint_head = keypoint_head
    self.keypoint_predictor = keypoint_predictor
def has_mask(self):
    """Return True only when the mask RoI pool, head and predictor are all set."""
    if self.mask_roi_pool is None:
        return False
    if self.mask_head is None:
        return False
    if self.mask_predictor is None:
        return False
    return True
def has_keypoint(self):
    """Return True only when the keypoint RoI pool, head and predictor are all set."""
    if self.keypoint_roi_pool is None:
        return False
    if self.keypoint_head is None:
        return False
    if self.keypoint_predictor is None:
        return False
    return True
def assign_targets_to_proposals(self, proposals, gt_boxes, gt_labels):
    # type: (List[Tensor], List[Tensor], List[Tensor]) -> Tuple[List[Tensor], List[Tensor]]
    """
    Match every proposal to a ground-truth box and derive its class label.

    Returns, per image, the matched ground-truth indices (clamped to be
    non-negative) and the int64 labels: the ground-truth class for matched
    proposals, 0 for background and -1 for proposals that are ignored.
    """
    matched_idxs = []
    labels = []
    for proposals_in_image, gt_boxes_in_image, gt_labels_in_image in zip(proposals, gt_boxes, gt_labels):

        if gt_boxes_in_image.numel() == 0:
            # Background image
            device = proposals_in_image.device
            clamped_matched_idxs_in_image = torch.zeros(
                (proposals_in_image.shape[0],), dtype=torch.int64, device=device
            )
            labels_in_image = torch.zeros((proposals_in_image.shape[0],), dtype=torch.int64, device=device)
        else:
            # set to self.box_similarity when https://github.com/pytorch/pytorch/issues/27495 lands
            # IoU between every ground-truth box and every proposal
            match_quality_matrix = box_ops.box_iou(gt_boxes_in_image, proposals_in_image)
            # Matched ground-truth index per proposal; unmatched proposals get
            # the matcher's negative marker values
            matched_idxs_in_image = self.proposal_matcher(match_quality_matrix)

            # The negative markers are clamped to index 0 so they can be used for indexing
            clamped_matched_idxs_in_image = matched_idxs_in_image.clamp(min=0)

            # Class label of the matched ground-truth box
            labels_in_image = gt_labels_in_image[clamped_matched_idxs_in_image]
            labels_in_image = labels_in_image.to(dtype=torch.int64)

            # Label background (negative examples)
            bg_inds = matched_idxs_in_image == self.proposal_matcher.BELOW_LOW_THRESHOLD
            labels_in_image[bg_inds] = 0

            # Label ignore proposals (between low and high thresholds)
            ignore_inds = matched_idxs_in_image == self.proposal_matcher.BETWEEN_THRESHOLDS
            labels_in_image[ignore_inds] = -1  # -1 is ignored by sampler

        matched_idxs.append(clamped_matched_idxs_in_image)
        labels.append(labels_in_image)
    return matched_idxs, labels
def subsample(self, labels):
    # type: (List[Tensor]) -> List[Tensor]
    """Pick the proposals kept for training in each image.

    Args:
        labels: per-image label tensors handed to ``self.fg_bg_sampler``.

    Returns:
        One index tensor per image, listing the positions selected by the
        sampler as either positive or negative.
    """
    pos_masks, neg_masks = self.fg_bg_sampler(labels)
    kept_per_image = []
    for pos_mask, neg_mask in zip(pos_masks, neg_masks):
        # Union of both masks, converted to the indices that are set.
        kept_per_image.append(torch.where(pos_mask | neg_mask)[0])
    return kept_per_image
def add_gt_proposals(self, proposals, gt_boxes):
    # type: (List[Tensor], List[Tensor]) -> List[Tensor]
    """Append each image's ground-truth boxes to that image's proposals.

    Args:
        proposals: one tensor of proposals per image.
        gt_boxes: one tensor of ground-truth boxes per image.

    Returns:
        A new list whose i-th entry is ``proposals[i]`` followed by
        ``gt_boxes[i]``, concatenated along the first dimension.
    """
    augmented = []
    for proposal, gt_box in zip(proposals, gt_boxes):
        augmented.append(torch.cat((proposal, gt_box)))
    return augmented
def check_targets(self, targets):
    # type: (Optional[List[Dict[str, Tensor]]]) -> None
    """Validate that the training targets carry the keys this head needs.

    Args:
        targets: per-image target dictionaries; must not be None.

    Raises:
        AssertionError: if ``targets`` is None, if any target lacks the
            "boxes" or "labels" key, or if any target lacks the "masks" key
            while ``self.has_mask()`` is true.
    """
    # The checks raise explicitly instead of using ``assert`` so that they
    # still run under ``python -O`` and report which key is missing; the
    # exception type is the same one the bare asserts produced.
    if targets is None:
        raise AssertionError("targets should not be None")
    if not all(["boxes" in t for t in targets]):
        raise AssertionError('Every element of targets should have a "boxes" key')
    if not all(["labels" in t for t in targets]):
        raise AssertionError('Every element of targets should have a "labels" key')
    if self.has_mask():
        if not all(["masks" in t for t in targets]):
            raise AssertionError('Every element of targets should have a "masks" key')
def select_training_samples(
    self,
    proposals,  # type: List[Tensor]
    targets,  # type: Optional[List[Dict[str, Tensor]]]
):
    # type: (...) -> Tuple[List[Tensor], List[Tensor], List[Tensor], List[Tensor]]
    """Build the per-image training set for the box head.

    Ground-truth boxes are appended to the proposals, every proposal is
    matched to a ground-truth box and labelled, a subset is sampled, and
    regression targets are encoded for the sampled proposals.

    Args:
        proposals: one tensor of proposals per image.
        targets: per-image dicts providing at least "boxes" and "labels".

    Returns:
        ``(proposals, matched_idxs, labels, regression_targets)``, where the
        first three are per-image lists restricted to the sampled proposals
        and the last is the output of ``self.box_coder.encode``.
    """
    self.check_targets(targets)
    assert targets is not None
    first = proposals[0]
    dtype = first.dtype
    device = first.device

    # Ground-truth boxes are cast to the proposals' dtype so they can be
    # concatenated with them.
    gt_boxes = [t["boxes"].to(dtype) for t in targets]
    gt_labels = [t["labels"] for t in targets]

    # Add the ground-truth boxes to the pool of proposals.
    proposals = self.add_gt_proposals(proposals, gt_boxes)
    # Match every proposal to a ground-truth box and label it.
    matched_idxs, labels = self.assign_targets_to_proposals(proposals, gt_boxes, gt_labels)
    # Sample the proposals that will actually be trained on.
    sampled_inds = self.subsample(labels)

    matched_gt_boxes = []
    for img_id in range(len(proposals)):
        keep = sampled_inds[img_id]
        # Restrict proposals, labels and matched indices to the sample.
        proposals[img_id] = proposals[img_id][keep]
        labels[img_id] = labels[img_id][keep]
        kept_matches = matched_idxs[img_id][keep]
        matched_idxs[img_id] = kept_matches

        img_gt_boxes = gt_boxes[img_id]
        if img_gt_boxes.numel() == 0:
            # No ground truth in this image: use a single all-zero box so
            # the indexing below still works.
            img_gt_boxes = torch.zeros((1, 4), dtype=dtype, device=device)
        # Ground-truth box matched to each sampled proposal.
        matched_gt_boxes.append(img_gt_boxes[kept_matches])

    # Encode the matched ground-truth boxes relative to the proposals.
    regression_targets = self.box_coder.encode(matched_gt_boxes, proposals)
    return proposals, matched_idxs, labels, regression_targets
def postprocess_detections(
    self,
    class_logits,  # type: Tensor
    box_regression,  # type: Tensor
    proposals,  # type: List[Tensor]
    image_shapes,  # type: List[Tuple[int, int]]
):
    # type: (...) -> Tuple[List[Tensor], List[Tensor], List[Tensor]]
    """Turn raw box-head outputs into final per-image detections.

    Args:
        class_logits: classification logits for all proposals of the batch;
            the last dimension indexes classes, with column 0 treated as
            background.
        box_regression: regression output decoded by ``self.box_coder``.
        proposals: one tensor of proposals per image.
        image_shapes: one image size per image, used to clip boxes.

    Returns:
        ``(all_boxes, all_scores, all_labels)``: per-image lists of the
        detections that survive score thresholding, small-box removal,
        per-class NMS and the ``self.detections_per_img`` cap.
    """
    device = class_logits.device
    num_classes = class_logits.shape[-1]

    rois_per_image = [rois.shape[0] for rois in proposals]
    decoded_boxes = self.box_coder.decode(box_regression, proposals)
    class_probs = F.softmax(class_logits, -1)

    # Split the batch-level tensors back into one chunk per image.
    per_image_boxes = decoded_boxes.split(rois_per_image, 0)
    per_image_probs = class_probs.split(rois_per_image, 0)

    all_boxes = []
    all_scores = []
    all_labels = []
    for boxes, scores, image_shape in zip(per_image_boxes, per_image_probs, image_shapes):
        boxes = box_ops.clip_boxes_to_image(boxes, image_shape)

        # One class id per (proposal, class) score.
        labels = torch.arange(num_classes, device=device).view(1, -1).expand_as(scores)

        # Drop the background column, then flatten so that every
        # (proposal, class) pair becomes its own candidate detection.
        boxes = boxes[:, 1:].reshape(-1, 4)
        scores = scores[:, 1:].reshape(-1)
        labels = labels[:, 1:].reshape(-1)

        # Discard candidates whose score does not exceed the threshold.
        confident = torch.where(scores > self.score_thresh)[0]
        boxes, scores, labels = boxes[confident], scores[confident], labels[confident]

        # Discard empty (degenerate) boxes.
        non_empty = box_ops.remove_small_boxes(boxes, min_size=1e-2)
        boxes, scores, labels = boxes[non_empty], scores[non_empty], labels[non_empty]

        # Non-maximum suppression, done independently per class.
        survivors = box_ops.batched_nms(boxes, scores, labels, self.nms_thresh)
        # Keep only the top-k scoring detections.
        survivors = survivors[: self.detections_per_img]
        boxes, scores, labels = boxes[survivors], scores[survivors], labels[survivors]

        all_boxes.append(boxes)
        all_scores.append(scores)
        all_labels.append(labels)
    return all_boxes, all_scores, all_labels
def forward(
    self,
    features, # type: Dict[str, Tensor]
    proposals, # type: List[Tensor]
    image_shapes, # type: List[Tuple[int, int]]
    targets=None, # type: Optional[List[Dict[str, Tensor]]]
):
    # type: (...) -> Tuple[List[Dict[str, Tensor]], Dict[str, Tensor]]
    """
    Run the box head and, when configured, the mask and keypoint heads.

    Args:
        features (Dict[str, Tensor]): feature maps passed to the RoI pooling modules
        proposals (List[Tensor[N, 4]]): candidate boxes, one tensor per image
        image_shapes (List[Tuple[H, W]]): image sizes, one per image
        targets (List[Dict]): optional per-image targets; each needs "boxes"
            (floating point) and "labels" (int64), plus "keypoints" (float32)
            when the keypoint branch is enabled

    Returns:
        result (List[Dict[str, Tensor]]): empty in training mode; otherwise one
            dict per image with "boxes", "labels" and "scores", plus "masks"
            and/or "keypoints" and "keypoints_scores" when those branches run
        losses (Dict[str, Tensor]): in training mode holds "loss_classifier" and
            "loss_box_reg", plus "loss_mask" / "loss_keypoint" when those
            branches run; empty otherwise
    """
    # Validate target dtypes up front so failures surface before any compute.
    if targets is not None:
        for t in targets:
            # TODO: https://github.com/pytorch/pytorch/issues/26731
            # NOTE(review): this tuple is rebuilt for every target; it is loop-invariant.
            floating_point_types = (torch.float, torch.double, torch.half)
            assert t["boxes"].dtype in floating_point_types, "target boxes must of float type"
            assert t["labels"].dtype == torch.int64, "target labels must of int64 type"
            if self.has_keypoint():
                assert t["keypoints"].dtype == torch.float32, "target keypoints must of float type"
    if self.training:
        # Sample positive/negative proposals and compute their regression targets.
        proposals, matched_idxs, labels, regression_targets = self.select_training_samples(proposals, targets)
    else:
        labels = None
        regression_targets = None
        matched_idxs = None
    # Pool a feature for every proposal (original note: multi-scale RoI Align).
    box_features = self.box_roi_pool(features, proposals, image_shapes)
    # Original note: two fully connected layers turn the pooled features, e.g.
    # [1024, 256, 7, 7], into [1024, 1024] — actual shapes depend on configuration.
    box_features = self.box_head(box_features)
    # Predict class scores and box regression for every proposal.
    class_logits, box_regression = self.box_predictor(box_features)
    result: List[Dict[str, torch.Tensor]] = []
    losses = {}
    if self.training:
        assert labels is not None and regression_targets is not None
        # Compute the final classification loss and box regression loss.
        loss_classifier, loss_box_reg = fastrcnn_loss(class_logits, box_regression, labels, regression_targets)
        losses = {"loss_classifier": loss_classifier, "loss_box_reg": loss_box_reg}
    else:
        # Inference: convert the raw predictions into final per-image detections
        # (no loss is computed on this branch).
        boxes, scores, labels = self.postprocess_detections(class_logits, box_regression, proposals, image_shapes)
        num_images = len(boxes)
        for i in range(num_images):
            result.append(
                {
                    "boxes": boxes[i],
                    "labels": labels[i],
                    "scores": scores[i],
                }
            )
    if self.has_mask():
        # At inference the mask head runs on the detected boxes.
        mask_proposals = [p["boxes"] for p in result]
        if self.training:
            assert matched_idxs is not None
            # during training, only focus on positive boxes
            num_images = len(proposals)
            mask_proposals = []
            pos_matched_idxs = []
            for img_id in range(num_images):
                pos = torch.where(labels[img_id] > 0)[0]
                mask_proposals.append(proposals[img_id][pos])
                pos_matched_idxs.append(matched_idxs[img_id][pos])
        else:
            pos_matched_idxs = None
        if self.mask_roi_pool is not None:
            mask_features = self.mask_roi_pool(features, mask_proposals, image_shapes)
            mask_features = self.mask_head(mask_features)
            mask_logits = self.mask_predictor(mask_features)
        else:
            raise Exception("Expected mask_roi_pool to be not None")
        loss_mask = {}
        if self.training:
            assert targets is not None
            assert pos_matched_idxs is not None
            assert mask_logits is not None
            gt_masks = [t["masks"] for t in targets]
            gt_labels = [t["labels"] for t in targets]
            rcnn_loss_mask = maskrcnn_loss(mask_logits, mask_proposals, gt_masks, gt_labels, pos_matched_idxs)
            loss_mask = {"loss_mask": rcnn_loss_mask}
        else:
            # Attach the predicted masks to the detections of each image.
            labels = [r["labels"] for r in result]
            masks_probs = maskrcnn_inference(mask_logits, labels)
            for mask_prob, r in zip(masks_probs, result):
                r["masks"] = mask_prob
        losses.update(loss_mask)
    # keep none checks in if conditional so torchscript will conditionally
    # compile each branch
    if (
        self.keypoint_roi_pool is not None
        and self.keypoint_head is not None
        and self.keypoint_predictor is not None
    ):
        # At inference the keypoint head runs on the detected boxes.
        keypoint_proposals = [p["boxes"] for p in result]
        if self.training:
            # during training, only focus on positive boxes
            num_images = len(proposals)
            keypoint_proposals = []
            pos_matched_idxs = []
            assert matched_idxs is not None
            for img_id in range(num_images):
                pos = torch.where(labels[img_id] > 0)[0]
                keypoint_proposals.append(proposals[img_id][pos])
                pos_matched_idxs.append(matched_idxs[img_id][pos])
        else:
            pos_matched_idxs = None
        keypoint_features = self.keypoint_roi_pool(features, keypoint_proposals, image_shapes)
        keypoint_features = self.keypoint_head(keypoint_features)
        keypoint_logits = self.keypoint_predictor(keypoint_features)
        loss_keypoint = {}
        if self.training:
            assert targets is not None
            assert pos_matched_idxs is not None
            gt_keypoints = [t["keypoints"] for t in targets]
            rcnn_loss_keypoint = keypointrcnn_loss(
                keypoint_logits, keypoint_proposals, gt_keypoints, pos_matched_idxs
            )
            loss_keypoint = {"loss_keypoint": rcnn_loss_keypoint}
        else:
            assert keypoint_logits is not None
            assert keypoint_proposals is not None
            # Attach the predicted keypoints and their scores to each image's detections.
            keypoints_probs, kp_scores = keypointrcnn_inference(keypoint_logits, keypoint_proposals)
            for keypoint_prob, kps, r in zip(keypoints_probs, kp_scores, result):
                r["keypoints"] = keypoint_prob
                r["keypoints_scores"] = kps
        losses.update(loss_keypoint)
    return result, losses
评论(0)
您还未登录,请登录后发表或查看评论