retinaNet论文笔记

# 小郑之家~

### 下面以resnet18为例来研究一下代码

• basice blok


class BasicBlock(nn.Module):
expansion = 1

def __init__(self, inplanes, planes, stride=1, downsample=None):
super(BasicBlock, self).__init__()
self.conv1 = conv3x3(inplanes, planes, stride)
self.bn1 = nn.BatchNorm2d(planes)
self.relu = nn.ReLU(inplace=True)
self.conv2 = conv3x3(planes, planes)
self.bn2 = nn.BatchNorm2d(planes)
self.downsample = downsample
self.stride = stride

def forward(self, x):
residual = x

out = self.conv1(x)
out = self.bn1(out)
out = self.relu(out)

out = self.conv2(out)
out = self.bn2(out)

if self.downsample is not None:
residual = self.downsample(x)

out += residual
out = self.relu(out)

return out



        x = self.conv1(img_batch)
x = self.bn1(x)
x = self.relu(x)
x = self.maxpool(x)



        self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False)
self.bn1 = nn.BatchNorm2d(64)
self.relu = nn.ReLU(inplace=True)




x1 = self.layer1(x)   #(1,64,40,60)
x2 = self.layer2(x1)  #(1,128,40,60)
x3 = self.layer3(x2)  #(1,256,20,30)
x4 = self.layer4(x3)  #(1,512,10,15)

features = self.fpn([x2, x3, x4])



### fpn

fpn的前向计算的代码是这样的

def forward(self, inputs):

C3, C4, C5 = inputs

P5_x = self.P5_1(C5)
P5_upsampled_x = self.P5_upsampled(P5_x)
P5_x = self.P5_2(P5_x)    ##

P4_x = self.P4_1(C4)
P4_x = P5_upsampled_x + P4_x
P4_upsampled_x = self.P4_upsampled(P4_x)
P4_x = self.P4_2(P4_x)

P3_x = self.P3_1(C3)
P3_x = P3_x + P4_upsampled_x
P3_x = self.P3_2(P3_x)

P6_x = self.P6(C5)

P7_x = self.P7_1(P6_x)
P7_x = self.P7_2(P7_x)

return [P3_x, P4_x, P5_x, P6_x, P7_x]



C5除了去和C4做某些操作之外，还去产生了p6,p7 这一点有点像ssd里面的那几个multiscale的feature map类似，这里上采样采用的是最邻近的算法。真正经过上采样的只有C5,C4, 输出的5个feature,其channel是一样的。

        regression = torch.cat([self.regressionModel(feature) for feature in features], dim=1)
#print(regression.shape)

classification = torch.cat([self.classificationModel(feature) for feature in features], dim=1)



class RegressionModel(nn.Module):
def __init__(self, num_features_in, num_anchors=9, feature_size=256):
super(RegressionModel, self).__init__()

self.conv1 = nn.Conv2d(num_features_in, feature_size, kernel_size=3, padding=1)
self.act1 = nn.ReLU()

self.conv2 = nn.Conv2d(feature_size, feature_size, kernel_size=3, padding=1)
self.act2 = nn.ReLU()

self.conv3 = nn.Conv2d(feature_size, feature_size, kernel_size=3, padding=1)
self.act3 = nn.ReLU()

self.conv4 = nn.Conv2d(feature_size, feature_size, kernel_size=3, padding=1)
self.act4 = nn.ReLU()

self.output = nn.Conv2d(feature_size, num_anchors*4, kernel_size=3, padding=1)

def forward(self, x):

out = self.conv1(x)
out = self.act1(out)

out = self.conv2(out)
out = self.act2(out)

out = self.conv3(out)
out = self.act3(out)

out = self.conv4(out)
out = self.act4(out)

out = self.output(out)

# out is B x C x W x H, with C = 4*num_anchors
out = out.permute(0, 2, 3, 1)

return out.contiguous().view(out.shape[0], -1, 4)



class ClassificationModel(nn.Module):
def __init__(self, num_features_in, num_anchors=9, num_classes=80, prior=0.01, feature_size=256):
super(ClassificationModel, self).__init__()
self.num_classes = num_classes
self.num_anchors = num_anchors

self.conv1 = nn.Conv2d(num_features_in, feature_size, kernel_size=3, padding=1)
self.act1 = nn.ReLU()

self.conv2 = nn.Conv2d(feature_size, feature_size, kernel_size=3, padding=1)
self.act2 = nn.ReLU()

self.conv3 = nn.Conv2d(feature_size, feature_size, kernel_size=3, padding=1)
self.act3 = nn.ReLU()

self.conv4 = nn.Conv2d(feature_size, feature_size, kernel_size=3, padding=1)
self.act4 = nn.ReLU()

self.output = nn.Conv2d(feature_size, num_anchors*num_classes, kernel_size=3, padding=1)
self.output_act = nn.Sigmoid()

def forward(self, x):

out = self.conv1(x)
out = self.act1(out)

out = self.conv2(out)
out = self.act2(out)

out = self.conv3(out)
out = self.act3(out)

out = self.conv4(out)
out = self.act4(out)

out = self.output(out)
out = self.output_act(out)

# out is B x C x W x H, with C = n_classes + n_anchors
out1 = out.permute(0, 2, 3, 1)

batch_size, width, height, channels = out1.shape

out2 = out1.view(batch_size, width, height, self.num_anchors, self.num_classes)

return out2.contiguous().view(x.shape[0], -1, self.num_classes)



### anchor

anchor就是直接去产生anchor了，

anchors = self.anchors(img_batch)



### focal loss 部分



# 计算gt相对于anchor的位置，这个是fast-rcnn paper里面的。
targets_dx = (gt_ctr_x - anchor_ctr_x_pi) / anchor_widths_pi
targets_dy = (gt_ctr_y - anchor_ctr_y_pi) / anchor_heights_pi
targets_dw = torch.log(gt_widths / anchor_widths_pi)
targets_dh = torch.log(gt_heights / anchor_heights_pi)


regression_diff = torch.abs(targets - regression[positive_indices, :])



bce = -(targets * torch.log(classification) + (1.0 - targets) * torch.log(1.0 - classification))



### 上面的地方后来发现是我自己理解错了，从头到尾细看的话，发现是对的

• balanced cross entropy

• focal loss

class FocalLoss(nn.Module):
#def __init__(self):

def forward(self, classifications, regressions, anchors, annotations):
alpha = 0.25
gamma = 2.0
batch_size = classifications.shape[0]
classification_losses = []
regression_losses = []

anchor = anchors[0, :, :]

anchor_widths  = anchor[:, 2] - anchor[:, 0]
anchor_heights = anchor[:, 3] - anchor[:, 1]
anchor_ctr_x   = anchor[:, 0] + 0.5 * anchor_widths
anchor_ctr_y   = anchor[:, 1] + 0.5 * anchor_heights

for j in range(batch_size):

classification = classifications[j, :, :]
regression = regressions[j, :, :]

bbox_annotation = annotations[j, :, :]
bbox_annotation = bbox_annotation[bbox_annotation[:, 4] != -1]

if bbox_annotation.shape[0] == 0:
regression_losses.append(torch.tensor(0).float().cuda())
classification_losses.append(torch.tensor(0).float().cuda())

continue

classification = torch.clamp(classification, 1e-4, 1.0 - 1e-4)

IoU = calc_iou(anchors[0, :, :], bbox_annotation[:, :4]) # num_anchors x num_annotations

IoU_max, IoU_argmax = torch.max(IoU, dim=1) # num_anchors x 1

targets = torch.ones(classification.shape) * -1
targets = targets.cuda()

targets[torch.lt(IoU_max, 0.4), :] = 0

positive_indices = torch.ge(IoU_max, 0.5)

num_positive_anchors = positive_indices.sum()

assigned_annotations = bbox_annotation[IoU_argmax, :]
targets[positive_indices, :] = 0
targets[positive_indices, assigned_annotations[positive_indices, 4].long()] = 1

alpha_factor = torch.ones(targets.shape).cuda() * alpha

alpha_factor = torch.where(torch.eq(targets, 1.), alpha_factor, 1. - alpha_factor)
focal_weight = torch.where(torch.eq(targets, 1.), 1. - classification, classification)
focal_weight = alpha_factor * torch.pow(focal_weight, gamma)

bce = -(targets * torch.log(classification) + (1.0 - targets) * torch.log(1.0 - classification))

# cls_loss = focal_weight * torch.pow(bce, gamma)
cls_loss = focal_weight * bce

cls_loss = torch.where(torch.ne(targets, -1.0), cls_loss, torch.zeros(cls_loss.shape).cuda())

classification_losses.append(cls_loss.sum()/torch.clamp(num_positive_anchors.float(), min=1.0))

# compute the loss for regression

if positive_indices.sum() > 0:
assigned_annotations = assigned_annotations[positive_indices, :]

anchor_widths_pi = anchor_widths[positive_indices]
anchor_heights_pi = anchor_heights[positive_indices]
anchor_ctr_x_pi = anchor_ctr_x[positive_indices]
anchor_ctr_y_pi = anchor_ctr_y[positive_indices]

gt_widths  = assigned_annotations[:, 2] - assigned_annotations[:, 0]
gt_heights = assigned_annotations[:, 3] - assigned_annotations[:, 1]
gt_ctr_x   = assigned_annotations[:, 0] + 0.5 * gt_widths
gt_ctr_y   = assigned_annotations[:, 1] + 0.5 * gt_heights

# clip widths to 1
gt_widths  = torch.clamp(gt_widths, min=1)
gt_heights = torch.clamp(gt_heights, min=1)

targets_dx = (gt_ctr_x - anchor_ctr_x_pi) / anchor_widths_pi
targets_dy = (gt_ctr_y - anchor_ctr_y_pi) / anchor_heights_pi
targets_dw = torch.log(gt_widths / anchor_widths_pi)
targets_dh = torch.log(gt_heights / anchor_heights_pi)

targets = torch.stack((targets_dx, targets_dy, targets_dw, targets_dh))
targets = targets.t()

targets = targets/torch.Tensor([[0.1, 0.1, 0.2, 0.2]]).cuda()

negative_indices = 1 - positive_indices

regression_diff = torch.abs(targets - regression[positive_indices, :])

regression_loss = torch.where(
torch.le(regression_diff, 1.0 / 9.0),
0.5 * 9.0 * torch.pow(regression_diff, 2),
regression_diff - 0.5 / 9.0
)
regression_losses.append(regression_loss.mean())
else:
regression_losses.append(torch.tensor(0).float().cuda())



def calc_iou(a, b):
area = (b[:, 2] - b[:, 0]) * (b[:, 3] - b[:, 1])

iw = torch.min(torch.unsqueeze(a[:, 2], dim=1), b[:, 2]) - torch.max(torch.unsqueeze(a[:, 0], 1), b[:, 0])
ih = torch.min(torch.unsqueeze(a[:, 3], dim=1), b[:, 3]) - torch.max(torch.unsqueeze(a[:, 1], 1), b[:, 1])

iw = torch.clamp(iw, min=0)
ih = torch.clamp(ih, min=0)

ua = torch.unsqueeze((a[:, 2] - a[:, 0]) * (a[:, 3] - a[:, 1]), dim=1) + area - iw * ih

ua = torch.clamp(ua, min=1e-8)

intersection = iw * ih

IoU = intersection / ua

return IoU