#pragma once

#include <torch/torch.h>

#include <string>
#include <vector>
#include <map>
#include <optional>

// Model headers are included directly; replace them with forward declarations
// if circular dependencies ever arise.
#include "resnet/resnet.h"
#include "classifier/classifier.h"
#include "bb_regressor/bb_regressor.h"

namespace cimp {

struct DiMPTrackerParams {
    // --- Device ---
    // torch::Device device = torch::kCUDA; // Will be set by DiMPTracker constructor

    // --- Input / Preprocessing ---
    // Owning storage: a torch::IntArrayRef member initialized from a brace list
    // would dangle, since IntArrayRef is a non-owning view.
    std::vector<int64_t> image_sample_size = {288, 288}; // Target size of the cropped image sample
    std::string border_mode = "replicate"; // Border mode for patch extraction
    double patch_max_scale_change = 1.5;   // Max scale change for multiscale sampling

    // --- Target Model ---
    double search_area_scale = 5.0;   // Scale factor for the search area relative to the target size
    double target_inside_ratio = 0.2; // Ratio for keeping the target inside image boundaries

    // --- Classifier ---
    // Augmentation parameters, grouped in a sub-struct
    struct AugmentationParams {
        double augmentation_expansion_factor = 2.0;
        double random_shift_factor = 0.0;               // Typically 0 for DiMP, but can be non-zero
        std::vector<double> relativeshift = {0.0, 0.0}; // Relative shifts; usually more entries
        std::vector<double> blur = {};                  // Sigmas for Gaussian blur
        std::vector<double> rotate = {};                // Angles for rotation
        struct DropoutAug {
            int num = 0;       // Number of dropout samples
            float prob = 0.0f; // Dropout probability
        } dropout;
    } augmentation;
    bool use_augmentation = true;

    int sample_memory_size = 50; // For the classifier's target_boxes memory
    int net_opt_iter = 10;       // Optimizer iterations for filter learning

    // --- IoU Net (BB Regressor) ---
    bool use_iou_net = true;
    double box_jitter_pos = 0.1; // Jitter for proposal generation (relative to square_box_sz)
    double box_jitter_sz = 0.1;  // Jitter for proposal generation
    int box_refinement_iter = 5; // Iterations for box optimization
    double box_refinement_step_length = 1.0;
    double box_refinement_step_decay = 1.0;
    double maximal_aspect_ratio = 5.0;
    int iounet_k = 5; // Number of top proposals to average for the final box

    // --- Localization ---
    double target_not_found_threshold = 0.25; // Threshold below which the target is considered lost
    double target_neighborhood_scale = 2.2;   // Scale for masking the neighborhood around the max score
    bool update_scale_when_uncertain = true;

    // TODO: Add other parameters from the DiMP Python code as needed,
    // e.g. feature_stride, kernel_size (these might be derived from the network).
};
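
// Example override of the defaults (a minimal sketch; the values below are
// illustrative, not tuned):
//
//   cimp::DiMPTrackerParams params;
//   params.search_area_scale = 6.0;        // widen the search region
//   params.net_opt_iter = 5;               // fewer filter-learning iterations
//   params.augmentation.blur = {1.0, 2.0}; // enable Gaussian-blur augmentation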

class DiMPTracker {
public:
    DiMPTracker(const DiMPTrackerParams& params,
                const std::string& resnet_weights_dir,
                const std::string& classifier_weights_dir,
                const std::string& bbregressor_weights_dir,
                torch::Device device);

    // Initialize the tracker with the first frame and bounding box.
    // image_tensor_hwc_uchar: HWC uint8 tensor (convert from cv::Mat beforehand)
    // initial_bbox_xywh: [x, y, w, h] tensor for the target in the first frame
    void initialize(const torch::Tensor& image_tensor_hwc_uchar, const torch::Tensor& initial_bbox_xywh);

    // Track the target in a subsequent frame.
    // image_tensor_hwc_uchar: HWC uint8 tensor
    // Returns: [x, y, w, h] tensor for the predicted bounding box
    torch::Tensor track(const torch::Tensor& image_tensor_hwc_uchar);
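
    // Typical call sequence (a sketch only; the weight-directory layout and the
    // frame-source helper below are hypothetical, not part of this API):
    //
    //   cimp::DiMPTrackerParams params;
    //   cimp::DiMPTracker tracker(params, "weights/resnet", "weights/classifier",
    //                             "weights/bb_regressor", torch::Device(torch::kCUDA));
    //   tracker.initialize(first_frame_hwc_uchar, init_bbox_xywh);
    //   while (has_next_frame())
    //       torch::Tensor bbox_xywh = tracker.track(next_frame_hwc_uchar);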

private:
    // --- Core Models ---
    cimp::resnet::ResNet resnet_model_;
    Classifier classifier_model_;   // Classifier is in the global namespace
    BBRegressor bbregressor_model_; // BBRegressor is in the global namespace

    // --- Parameters & Device ---
    DiMPTrackerParams params_;
    torch::Device device_;

    // --- Tracker State ---
    torch::Tensor pos_;             // Target position (y_center, x_center) in image coordinates
    torch::Tensor target_sz_;       // Target size (height, width) in image coordinates
    torch::Tensor image_sz_;        // Current image size (height, width)
    double target_scale_;           // Current scale factor of the target
    torch::Tensor base_target_sz_;  // Target size at scale 1.0
    torch::Tensor img_sample_sz_;   // Size of the image sample patch (e.g., {288, 288})
    torch::Tensor img_support_sz_;  // Usually the same as img_sample_sz_
    torch::Tensor init_sample_pos_; // Position used for generating initial samples
    double init_sample_scale_;      // Scale used for generating initial samples

    // Learned components
    torch::Tensor target_filter_;               // Learned DiMP classification filter: [num_filters, C, H, W]
    std::vector<torch::Tensor> iou_modulation_; // Learned IoU modulation vectors: list of [1, C, 1, 1]

    // Feature/kernel sizes (often derived during initialization)
    torch::Tensor feature_sz_;  // Size of the classification feature map (e.g., {18, 18})
    torch::Tensor kernel_size_; // Size of the classification filter (e.g., {4, 4})
    // torch::Tensor output_sz_; // output_sz = feature_sz + (kernel_size + 1) % 2
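    // Worked example with the example sizes above: output_sz = 18 + (4 + 1) % 2 = 19
    // per spatial dimension (illustrative; the actual values are derived from the
    // loaded network at initialization).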

    // Augmentation transforms (might be more complex in C++);
    // for now the logic lives in generate_init_samples.
    // std::vector<std::function<torch::Tensor(torch::Tensor)>> transforms_;

    // Stored target boxes for classifier training
    torch::Tensor stored_target_boxes_; // [memory_size, 4]

    // --- Helper Methods (to be implemented in the .cpp) ---
    torch::Tensor convert_image_to_tensor_chw_float(const torch::Tensor& image_hwc_uchar);
    std::pair<std::vector<torch::Tensor>, torch::Tensor> generate_init_samples(const torch::Tensor& image_chw_float);
    void init_classifier_internal(const std::vector<torch::Tensor>& init_backbone_feat_list,
                                  const torch::Tensor& init_target_boxes_aug);
    void init_iou_net_internal(const std::vector<torch::Tensor>& init_backbone_feat_list,
                               const torch::Tensor& initial_bbox_for_iou);

    std::pair<std::map<std::string, torch::Tensor>, torch::Tensor> extract_backbone_features(
        const torch::Tensor& image_chw_float,
        const torch::Tensor& pos,
        const torch::Tensor& scales, // 1D tensor of scales
        const torch::IntArrayRef& sample_sz);
    torch::Tensor get_classification_features(const std::map<std::string, torch::Tensor>& backbone_feat);
    std::vector<torch::Tensor> get_iou_backbone_features(const std::map<std::string, torch::Tensor>& backbone_feat);
    std::vector<torch::Tensor> get_iou_features(const std::map<std::string, torch::Tensor>& backbone_feat);

    std::pair<torch::Tensor, torch::Tensor> get_sample_location(const torch::Tensor& sample_coords_xyxy);
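    // (In the pytracking reference, the returned sample position is the crop
    // center, 0.5 * (coords[:2] + coords[2:] - 1), and the sample scale is
    // sqrt(crop_area / sample_area); this port is assumed to follow suit.)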

    torch::Tensor get_centered_sample_pos();

    torch::Tensor classify_target(const torch::Tensor& test_x_clf_feat);

    struct LocalizationResult {
        torch::Tensor translation_vec_yx; // (y, x) displacement
        int64_t scale_idx;
        torch::Tensor scores_peak_map;    // The score map from the peak scale
        std::string flag;                 // "normal", "not_found", "uncertain"
    };
    LocalizationResult localize_target(const torch::Tensor& scores_raw,
                                       const torch::Tensor& sample_pos_yx,
                                       const torch::Tensor& sample_scales);
    LocalizationResult localize_advanced(const torch::Tensor& scores_scaled,
                                         const torch::Tensor& sample_pos_yx,
                                         const torch::Tensor& sample_scales);

    void update_state(const torch::Tensor& new_pos_yx);

    torch::Tensor get_iounet_box(const torch::Tensor& pos_yx, const torch::Tensor& sz_hw,
                                 const torch::Tensor& sample_pos_yx, double sample_scale);
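    // Expected mapping, following the pytracking reference (a sketch; the actual
    // .cpp implementation may differ):
    //   box_center = (pos_yx - sample_pos_yx) / sample_scale + (img_sample_sz_ - 1) / 2
    //   box_sz     = sz_hw / sample_scale
    // with the result returned as [x, y, w, h] in crop coordinates.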

    void refine_target_box(const std::map<std::string, torch::Tensor>& backbone_feat,
                           const torch::Tensor& sample_pos_yx,
                           double sample_scale,
                           int64_t scale_idx,
                           bool update_scale_flag);
    std::pair<torch::Tensor, torch::Tensor> optimize_boxes_default(
        const std::vector<torch::Tensor>& iou_features,
        const torch::Tensor& init_boxes_xywh); // proposals_xywh

    // Image processing / patch sampling helpers
    std::pair<torch::Tensor, torch::Tensor> sample_patch_multiscale_affine(
        const torch::Tensor& im_chw_float,
        const torch::Tensor& pos_yx,
        const torch::Tensor& scales, // 1D tensor of scales
        const torch::IntArrayRef& output_sz_hw,
        const std::string& border_mode = "replicate",
        std::optional<double> max_scale_change = std::nullopt);
    std::pair<torch::Tensor, torch::Tensor> sample_patch_transformed_affine(
        const torch::Tensor& im_chw_float,
        const torch::Tensor& pos_yx,
        double scale,
        const torch::IntArrayRef& aug_expansion_sz_hw,     // Size of patch to extract before transform
        const std::vector<torch::Tensor>& affine_matrices, // One 2x3 affine matrix per transform
        const torch::IntArrayRef& out_sz_hw);              // Final output size after transform

    // Augmentation helpers
    // ...
};

} // namespace cimp