import torch.nn as nn
import torch
from ltr.external.PreciseRoIPooling.pytorch.prroi_pool import PrRoIPool2D
import math


class FilterPool(nn.Module):
    """Pool the target region in a feature map.
    args:
        filter_size:  Size of the filter.
        feature_stride:  Input feature stride.
        pool_square:  Do a square pooling instead of pooling the exact target region."""

    def __init__(self, filter_size=1, feature_stride=16, pool_square=False):
        super().__init__()
        self.prroi_pool = PrRoIPool2D(filter_size, filter_size, 1/feature_stride)
        self.pool_square = pool_square

    def forward(self, feat, bb):
        """Pool the regions in bb.
        args:
            feat:  Input feature maps. Dims (num_samples, feat_dim, H, W).
            bb:  Target bounding boxes (x, y, w, h) in the image coords. Dims (num_samples, 4).
        returns:
            pooled_feat:  Pooled features. Dims (num_samples, feat_dim, wH, wW)."""

        # Add batch_index to rois
        bb = bb.reshape(-1, 4)
        num_images_total = bb.shape[0]
        batch_index = torch.arange(num_images_total, dtype=torch.float32).reshape(-1, 1).to(bb.device)

        pool_bb = bb.clone()

        # Optionally replace the target box with a square box of equal area, centered at the same point.
        if self.pool_square:
            bb_sz = pool_bb[:, 2:4].prod(dim=1, keepdim=True).sqrt()
            pool_bb[:, :2] += pool_bb[:, 2:]/2 - bb_sz/2
            pool_bb[:, 2:] = bb_sz

        # The input bb is in (x, y, w, h) format; convert it to the (x0, y0, x1, y1) format expected by PrRoIPool2D.
        pool_bb[:, 2:4] = pool_bb[:, 0:2] + pool_bb[:, 2:4]
        roi1 = torch.cat((batch_index, pool_bb), dim=1)

        return self.prroi_pool(feat, roi1)
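
# A hypothetical usage sketch of FilterPool (shapes and box values are made up
# for illustration). The reference PrRoIPool2D implementation only provides
# CUDA kernels, so the inputs are assumed to live on a GPU:
#
#     pool = FilterPool(filter_size=4, feature_stride=16)
#     feat = torch.rand(2, 256, 18, 18).cuda()            # (num_samples, feat_dim, H, W)
#     bb = torch.tensor([[64., 64., 128., 128.],
#                        [32., 48.,  96.,  64.]]).cuda()  # (x, y, w, h) per sample
#     pooled = pool(feat, bb)                             # -> (2, 256, 4, 4)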


class FilterInitializerLinear(nn.Module):
    """Initializes a target classification filter by applying a linear conv layer and then pooling the target region.
    args:
        filter_size:  Size of the filter.
        feature_dim:  Input feature dimensionality.
        feature_stride:  Input feature stride.
        pool_square:  Do a square pooling instead of pooling the exact target region.
        filter_norm:  Normalize the output filter with its size in the end.
        conv_ksz:  Kernel size of the conv layer before pooling.
        init_weights:  Weight initialization for the conv layer: 'default' (zero-mean Gaussian) or 'zero'."""

    def __init__(self, filter_size=1, feature_dim=256, feature_stride=16, pool_square=False, filter_norm=True,
                 conv_ksz=3, init_weights='default'):
        super().__init__()
        self.filter_conv = nn.Conv2d(feature_dim, feature_dim, kernel_size=conv_ksz, padding=conv_ksz // 2)
        self.filter_pool = FilterPool(filter_size=filter_size, feature_stride=feature_stride, pool_square=pool_square)
        self.filter_norm = filter_norm

        # Init weights
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                if init_weights == 'default':
                    # He-style initialization: zero-mean Gaussian with variance 2 / fan_out.
                    n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
                    m.weight.data.normal_(0, math.sqrt(2. / n))
                elif init_weights == 'zero':
                    m.weight.data.zero_()
                if m.bias is not None:
                    m.bias.data.zero_()
            elif isinstance(m, nn.BatchNorm2d):
                m.weight.data.fill_(1)
                m.bias.data.zero_()

    def forward(self, feat, bb):
        """Runs the initializer module.
        Note that [] denotes an optional dimension.
        args:
            feat:  Input feature maps. Dims (images_in_sequence, [sequences], feat_dim, H, W).
            bb:  Target bounding boxes (x, y, w, h) in the image coords. Dims (images_in_sequence, [sequences], 4).
        returns:
            weights:  The output weights. Dims (sequences, feat_dim, wH, wW)."""

        num_images = feat.shape[0]

        feat = self.filter_conv(feat.reshape(-1, feat.shape[-3], feat.shape[-2], feat.shape[-1]))

        weights = self.filter_pool(feat, bb)

        # If multiple input images, compute the initial filter as the average filter.
        if num_images > 1:
            weights = torch.mean(weights.reshape(num_images, -1, weights.shape[-3], weights.shape[-2], weights.shape[-1]), dim=0)

        if self.filter_norm:
            weights = weights / (weights.shape[1] * weights.shape[2] * weights.shape[3])

        return weights
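

if __name__ == '__main__':
    # Minimal smoke test, assuming a CUDA device is available (the reference
    # PrRoIPool2D implementation only ships CUDA kernels). All shapes and box
    # values below are hypothetical, chosen only for illustration.
    if torch.cuda.is_available():
        initializer = FilterInitializerLinear(filter_size=4, feature_dim=256,
                                              feature_stride=16).cuda()
        # 3 images per sequence, 2 sequences, 18x18 feature maps (stride 16).
        feat = torch.rand(3, 2, 256, 18, 18).cuda()
        # One (x, y, w, h) box per image and sequence, in image coordinates.
        bb = torch.tensor([[32., 32., 96., 96.]]).repeat(3, 2, 1).cuda()
        weights = initializer(feat, bb)
        print(weights.shape)  # expected: torch.Size([2, 256, 4, 4])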