#include "bb_regressor.h"
|
|
#include <iostream>
|
|
#include <fstream>
|
|
#include <torch/script.h>
|
|
#include <torch/serialize.h>
|
|
#include <vector>
|
|
#include <stdexcept>
|
|
// Add CUDA includes for required CUDA implementation
|
|
#include <cuda_runtime.h>
|
|
#include <ATen/cuda/CUDAContext.h>
|
|
|
|
// Use the PrRoIPooling implementation
|
|
#include "prroi_pooling_gpu.h"
|
|
#include "prroi_pooling_gpu_impl.cuh"
|
|
|
|
// PrRoIPool2D implementation (requires CUDA)
PrRoIPool2D::PrRoIPool2D(int pooled_height, int pooled_width, float spatial_scale)
    : pooled_height_(pooled_height), pooled_width_(pooled_width), spatial_scale_(spatial_scale) {}

torch::Tensor PrRoIPool2D::forward(torch::Tensor feat, torch::Tensor rois) {
    // Print shape info for debugging
    std::cout << " PrRoIPool2D inputs: " << std::endl;
    std::cout << "  Features: [" << feat.size(0) << ", " << feat.size(1) << ", "
              << feat.size(2) << ", " << feat.size(3) << "]" << std::endl;
    std::cout << "  ROIs: [" << rois.size(0) << ", " << rois.size(1) << "]" << std::endl;
    std::cout << "  Pooled size: [" << pooled_height_ << ", " << pooled_width_ << "]" << std::endl;
    std::cout << "  Spatial scale: " << spatial_scale_ << std::endl;

    // Calculate output shape
    int channels = feat.size(1);
    int num_rois = rois.size(0);

    // Ensure both tensors are on CUDA initially (as they come from GPU operations)
    if (!feat.is_cuda() || !rois.is_cuda()) {
        // This case should not happen if inputs come from CUDA model parts, but if it
        // does, move them to CUDA first for consistency, then to CPU for the C function.
        std::cout << "Warning: PrRoIPool2D received non-CUDA tensor(s). Moving to CUDA then CPU." << std::endl;
        feat = feat.to(torch::kCUDA);
        rois = rois.to(torch::kCUDA);
    }

    // Print ROI values for debugging
    std::cout << "  ROI values (on device " << rois.device() << "): " << std::endl;
    auto rois_cpu_for_print = rois.to(torch::kCPU).contiguous(); // Temp CPU copy for printing
    for (int i = 0; i < std::min(num_rois, 3); i++) {
        std::cout << "   ROI " << i << ": [";
        for (int j = 0; j < rois_cpu_for_print.size(1); j++) {
            std::cout << rois_cpu_for_print[i][j].item<float>();
            if (j < rois_cpu_for_print.size(1) - 1) std::cout << ", ";
        }
        std::cout << "]" << std::endl;
    }

    // Create the output tensor on the same original device as feat (CUDA)
    auto output = torch::zeros({num_rois, channels, pooled_height_, pooled_width_},
                               feat.options());

    // REVERTED: copy tensors to CPU for the C implementation, as
    // prroi_pooling_forward_cuda expects CPU pointers
    auto feat_cpu = feat.to(torch::kCPU).contiguous();
    auto rois_cpu = rois.to(torch::kCPU).contiguous();     // already on CPU for printing; ensure contiguous
    auto output_cpu = output.to(torch::kCPU).contiguous(); // CPU buffer for the C function to fill

    // Call the C wrapper function (which is a CPU implementation)
    std::cout << "  Calling prroi_pooling_forward_cuda (CPU implementation)..." << std::endl;
    prroi_pooling_forward_cuda(
        feat_cpu.data_ptr<float>(),
        rois_cpu.data_ptr<float>(),    // pass the CPU ROI data
        output_cpu.data_ptr<float>(),  // pass the CPU output buffer
        channels,
        feat.size(2),
        feat.size(3),
        num_rois,
        pooled_height_,
        pooled_width_,
        spatial_scale_
    );
    std::cout << "  prroi_pooling_forward_cuda completed" << std::endl;

    // Copy the result back to the original device (GPU)
    output.copy_(output_cpu);

    return output;
}
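
// Example usage (illustrative sketch only; the ROI layout follows the
// [batch_idx, x1, y1, x2, y2] convention assumed throughout this file):
//
//   PrRoIPool2D pool(3, 3, 0.125);  // 3x3 output, 1/8 scale (same as prroi_pool3r below)
//   auto feat = torch::rand({1, 128, 32, 32}, torch::kCUDA);
//   auto rois = torch::tensor({{0.f, 8.f, 8.f, 24.f, 24.f}}, torch::kCUDA);
//   auto pooled = pool.forward(feat, rois);  // -> [1, 128, 3, 3]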

// LinearBlock implementation
LinearBlock::LinearBlock(int in_planes, int out_planes, int input_sz, bool bias, bool batch_norm, bool relu) {
    // Create the linear layer with proper input dimensions
    auto linear_options = torch::nn::LinearOptions(in_planes * input_sz * input_sz, out_planes).bias(bias);
    linear = register_module("linear", torch::nn::Linear(linear_options));

    use_bn = batch_norm;
    if (use_bn) {
        // Important: use BatchNorm2d to match the Python implementation
        bn = register_module("bn", torch::nn::BatchNorm2d(torch::nn::BatchNorm2dOptions(out_planes)));
        // Initialize BatchNorm weights and biases like Python
        bn->weight.data().uniform_();
        bn->bias.data().zero_();
    }

    use_relu = relu;
    if (use_relu) {
        relu_ = register_module("relu", torch::nn::ReLU(torch::nn::ReLUOptions().inplace(true)));
    }
}

torch::Tensor LinearBlock::forward(torch::Tensor x) {
    // Note: earlier revisions round-tripped through double precision here; those
    // conversions were removed, and x stays in its original dtype throughout.

    // Reshape exactly as in Python: x.reshape(x.shape[0], -1)
    x = x.reshape({x.size(0), -1}).contiguous();

    x = linear->forward(x);

    if (use_bn) {
        // Crucial: reshape to a 4D tensor for BatchNorm2d, exactly as in Python:
        // x = self.bn(x.reshape(x.shape[0], x.shape[1], 1, 1))
        x = x.reshape({x.size(0), x.size(1), 1, 1}).contiguous();
        x = bn->forward(x);
    }

    // Apply ReLU if needed
    if (use_relu) {
        x = relu_->forward(x);
    }

    // Final reshape to a 2D tensor, exactly matching Python's behavior
    x = x.reshape({x.size(0), -1}).contiguous();

    return x;
}
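
// Shape walk-through for fc3_rt = LinearBlock(256, 256, 5) (derived from the code above):
//   input  [N, 256, 5, 5] -> reshape -> [N, 6400]   (256 * 5 * 5 = 6400)
//   linear (6400 -> 256)  -> [N, 256]
//   bn (as 4D [N,256,1,1]) -> ReLU -> final reshape -> [N, 256]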

// Create convolutional block
torch::nn::Sequential BBRegressor::create_conv_block(int in_planes, int out_planes,
                                                     int kernel_size, int stride,
                                                     int padding, int dilation) {
    // Print dimensions for debugging
    std::cout << "Creating conv block: in_planes=" << in_planes << ", out_planes=" << out_planes << std::endl;

    torch::nn::Sequential seq;

    // Add convolutional layer
    seq->push_back(torch::nn::Conv2d(torch::nn::Conv2dOptions(in_planes, out_planes, kernel_size)
                       .stride(stride).padding(padding).dilation(dilation).bias(true)));

    // Add batch normalization layer
    auto bn_layer = torch::nn::BatchNorm2d(torch::nn::BatchNorm2dOptions(out_planes));
    // Initialize BatchNorm weights and biases like Python
    bn_layer->weight.data().uniform_();
    bn_layer->bias.data().zero_();
    seq->push_back(bn_layer);

    // Add ReLU activation
    seq->push_back(torch::nn::ReLU(torch::nn::ReLUOptions().inplace(true)));

    return seq;
}
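
// Example (mirrors the constructor below): create_conv_block(512, 128, 3, 1, 1, 1)
// builds Conv2d(512 -> 128, kernel 3, stride 1, padding 1) + BatchNorm2d(128) + ReLU,
// matching Python's conv(input_dim[0], 128, kernel_size=3, stride=1).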

// Helper function to verify BatchNorm dimensions
void BBRegressor::verify_batchnorm_dimensions() {
    std::cout << "Verifying BatchNorm dimensions..." << std::endl;

    // Each conv block is Conv2d (index 0) -> BatchNorm2d (index 1) -> ReLU (index 2);
    // inspect module [1] of each block.
    auto inspect = [](torch::nn::Sequential& seq, const std::string& name) {
        std::cout << name << " has " << seq->size() << " modules" << std::endl;
        if (seq->size() > 1) {
            auto module = seq[1];
            std::cout << name << " module[1] type: " << module->name() << std::endl;
        }
    };

    inspect(conv3_1r, "conv3_1r");
    inspect(conv3_1t, "conv3_1t");
    inspect(conv3_2t, "conv3_2t");
}

// Helper function to read a file into a byte buffer
std::vector<char> BBRegressor::read_file_to_bytes(const std::string& file_path) {
    std::ifstream file(file_path, std::ios::binary | std::ios::ate);
    if (!file.is_open()) {
        throw std::runtime_error("Could not open file: " + file_path);
    }

    std::streamsize size = file.tellg();
    file.seekg(0, std::ios::beg);

    std::vector<char> buffer(size);
    if (!file.read(buffer.data(), size)) {
        throw std::runtime_error("Could not read file: " + file_path);
    }

    return buffer;
}

// Load a tensor from file
torch::Tensor BBRegressor::load_tensor(const std::string& file_path) {
    try {
        // Read the file into bytes first
        std::vector<char> data = read_file_to_bytes(file_path);
        // Use pickle_load with the byte data
        torch::Tensor tensor = torch::pickle_load(data).toTensor();

        // Always move the tensor to the configured device
        return tensor.to(device);
    } catch (const c10::Error& e) {
        std::cerr << "Error loading tensor from " << file_path << ": " << e.what() << std::endl;
        throw;
    }
}
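
// Note on the expected file format: torch::pickle_load expects the bytes of a
// single pickled tensor. The *.pt files read here are presumably exported from
// Python with torch.save on bare tensors (e.g. torch.save(t, "conv3_1r_0_weight.pt"));
// the export script is not part of this file, so treat that as an assumption.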

// Constructor
BBRegressor::BBRegressor(const std::string& model_weights_dir, torch::Device dev)
    : device(dev), model_dir(model_weights_dir),
      fc3_rt(256, 256, 5, true, true, true),
      fc4_rt(256, 256, 3, true, true, true) {

    // Check that the model directory exists
    if (!fs::exists(model_dir)) {
        throw std::runtime_error("Model directory does not exist: " + model_dir);
    }

    // Initialize convolution blocks - match Python's AtomIoUNet implementation exactly
    std::cout << "Initializing conv blocks..." << std::endl;

    // In Python: self.conv3_1r = conv(input_dim[0], 128, kernel_size=3, stride=1)
    conv3_1r = create_conv_block(512, 128, 3, 1, 1, 1);

    // In Python: self.conv3_1t = conv(input_dim[0], 256, kernel_size=3, stride=1)
    conv3_1t = create_conv_block(512, 256, 3, 1, 1, 1);

    // In Python: self.conv3_2t = conv(256, pred_input_dim[0], kernel_size=3, stride=1)
    conv3_2t = create_conv_block(256, 256, 3, 1, 1, 1);

    // Pooling sizes match the Python model exactly
    // In Python: self.prroi_pool3r = PrRoIPool2D(3, 3, 1/8)
    prroi_pool3r = std::make_shared<PrRoIPool2D>(3, 3, 0.125);  // 1/8 scale for layer2

    // In Python: self.prroi_pool3t = PrRoIPool2D(5, 5, 1/8)
    prroi_pool3t = std::make_shared<PrRoIPool2D>(5, 5, 0.125);  // 1/8 scale for layer2

    // Create sequential blocks
    // In Python: self.fc3_1r = conv(128, 256, kernel_size=3, stride=1, padding=0)
    fc3_1r = create_conv_block(128, 256, 3, 1, 0, 1);  // padding=0 for this layer

    // In Python: self.conv4_1r = conv(input_dim[1], 256, kernel_size=3, stride=1)
    conv4_1r = create_conv_block(1024, 256, 3, 1, 1, 1);

    // In Python: self.conv4_1t = conv(input_dim[1], 256, kernel_size=3, stride=1)
    conv4_1t = create_conv_block(1024, 256, 3, 1, 1, 1);

    // In Python: self.conv4_2t = conv(256, pred_input_dim[1], kernel_size=3, stride=1)
    conv4_2t = create_conv_block(256, 256, 3, 1, 1, 1);

    // In Python: self.prroi_pool4r = PrRoIPool2D(1, 1, 1/16)
    prroi_pool4r = std::make_shared<PrRoIPool2D>(1, 1, 0.0625);  // 1/16 scale for layer3

    // In Python: self.prroi_pool4t = PrRoIPool2D(3, 3, 1/16)
    prroi_pool4t = std::make_shared<PrRoIPool2D>(3, 3, 0.0625);  // 1/16 scale for layer3

    // In Python: self.fc34_3r = conv(256 + 256, pred_input_dim[0], kernel_size=1, stride=1, padding=0)
    fc34_3r = create_conv_block(512, 256, 1, 1, 0, 1);  // kernel_size=1, padding=0

    // In Python: self.fc34_4r = conv(256 + 256, pred_input_dim[1], kernel_size=1, stride=1, padding=0)
    fc34_4r = create_conv_block(512, 256, 1, 1, 0, 1);  // kernel_size=1, padding=0

    // Linear blocks - exactly match the Python implementation's dimensions and parameters
    // In Python: self.fc3_rt = LinearBlock(pred_input_dim[0], pred_inter_dim[0], 5)
    fc3_rt = LinearBlock(256, 256, 5, true, true, true);

    // In Python: self.fc4_rt = LinearBlock(pred_input_dim[1], pred_inter_dim[1], 3)
    fc4_rt = LinearBlock(256, 256, 3, true, true, true);

    // In Python: self.iou_predictor = nn.Linear(pred_inter_dim[0]+pred_inter_dim[1], 1, bias=True)
    iou_predictor = torch::nn::Linear(torch::nn::LinearOptions(256 + 256, 1).bias(true));

    // Load all weights
    load_weights();

    // Set the model to evaluation mode
    this->eval();

    // Debug information
    std::cout << "BB Regressor initialized in evaluation mode" << std::endl;
}
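
// Network layout summary (as built above; see get_modulation / predict_iou below):
//   Reference branch: conv3_1r / conv4_1r -> prroi_pool3r / prroi_pool4r -> fc3_1r ->
//                     concat (512ch) -> fc34_3r / fc34_4r = the modulation vectors
//   Test branch:      conv3_1t+conv3_2t / conv4_1t+conv4_2t -> prroi_pool3t / prroi_pool4t ->
//                     fc3_rt / fc4_rt -> modulated concat (512) -> iou_predictor (512 -> 1)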

// Set the model to evaluation mode
void BBRegressor::eval() {
    // Set all sequential modules to eval mode
    conv3_1r->eval();
    conv3_1t->eval();
    conv3_2t->eval();
    fc3_1r->eval();
    conv4_1r->eval();
    conv4_1t->eval();
    conv4_2t->eval();
    fc34_3r->eval();
    fc34_4r->eval();

    // Linear blocks also need eval mode, for their BatchNorm layers
    fc3_rt.eval();
    fc4_rt.eval();

    // Set the linear layer to eval mode (usually a no-op for a plain Linear)
    iou_predictor->eval();
}

// Load weights
void BBRegressor::load_weights() {
    // Helper lambda to load weights for a sequential (conv + BN + ReLU) module
    auto load_sequential_weights = [this](torch::nn::Sequential& seq, const std::string& prefix) {
        try {
            // Load weights for the conv layer (index 0)
            std::string weight_path = model_dir + "/" + prefix + "_0_weight.pt";
            std::string bias_path = model_dir + "/" + prefix + "_0_bias.pt";

            if (fs::exists(weight_path) && fs::exists(bias_path)) {
                auto conv_weight = load_tensor(weight_path);
                auto conv_bias = load_tensor(bias_path);

                // Derive the conv dimensions from the weight tensor itself
                int out_channels = conv_weight.size(0);
                int in_channels = conv_weight.size(1);
                int kernel_size = conv_weight.size(2);

                std::cout << "Loading " << prefix << " conv weights: "
                          << "[out_ch=" << out_channels
                          << ", in_ch=" << in_channels
                          << ", kernel=" << kernel_size << "]" << std::endl;

                // Use the correct padding based on the layer name
                int padding = 1; // default padding

                // Special cases: these layers use padding=0 in the Python implementation
                if (prefix == "fc3_1r" || prefix == "fc34_3r" || prefix == "fc34_4r") {
                    padding = 0;
                }

                std::cout << "  Using padding=" << padding << " for " << prefix << std::endl;

                auto conv_options = torch::nn::Conv2dOptions(in_channels, out_channels, kernel_size)
                                        .stride(1).padding(padding).bias(true);
                auto conv_module = torch::nn::Conv2d(conv_options);

                // Set weights and bias
                conv_module->weight = conv_weight;
                conv_module->bias = conv_bias;

                // Debug info - print some weight stats
                std::cout << "  Conv weight stats: mean=" << conv_weight.mean().item<float>()
                          << ", std=" << conv_weight.std().item<float>()
                          << ", min=" << conv_weight.min().item<float>()
                          << ", max=" << conv_weight.max().item<float>() << std::endl;

                // Build a new sequence around the properly-sized conv module
                auto new_seq = torch::nn::Sequential();
                new_seq->push_back(conv_module);

                // Load batch norm parameters (index 1)
                std::string bn_weight_path = model_dir + "/" + prefix + "_1_weight.pt";
                std::string bn_bias_path = model_dir + "/" + prefix + "_1_bias.pt";
                std::string bn_mean_path = model_dir + "/" + prefix + "_1_running_mean.pt";
                std::string bn_var_path = model_dir + "/" + prefix + "_1_running_var.pt";

                if (fs::exists(bn_weight_path) && fs::exists(bn_bias_path) &&
                    fs::exists(bn_mean_path) && fs::exists(bn_var_path)) {

                    auto bn_weight = load_tensor(bn_weight_path);
                    auto bn_bias = load_tensor(bn_bias_path);
                    auto bn_mean = load_tensor(bn_mean_path);
                    auto bn_var = load_tensor(bn_var_path);

                    // Important: create the BatchNorm with the feature count taken from the weights
                    int num_features = bn_weight.size(0);
                    std::cout << "  Creating BatchNorm2d with num_features=" << num_features << std::endl;

                    // Create a proper batch norm module with the right number of features
                    auto bn_options = torch::nn::BatchNorm2dOptions(num_features)
                                          .eps(1e-5)      // match Python default
                                          .momentum(0.1)  // match Python default
                                          .affine(true)
                                          .track_running_stats(true);
                    auto bn_module = torch::nn::BatchNorm2d(bn_options);

                    // Set batch norm parameters
                    bn_module->weight = bn_weight;
                    bn_module->bias = bn_bias;
                    bn_module->running_mean = bn_mean;
                    bn_module->running_var = bn_var;

                    // Debug info - print some batch norm stats
                    std::cout << "  BN weight stats: mean=" << bn_weight.mean().item<float>()
                              << ", std=" << bn_weight.std().item<float>() << std::endl;
                    std::cout << "  BN running_mean stats: mean=" << bn_mean.mean().item<float>()
                              << ", std=" << bn_mean.std().item<float>() << std::endl;
                    std::cout << "  BN running_var stats: mean=" << bn_var.mean().item<float>()
                              << ", std=" << bn_var.std().item<float>() << std::endl;

                    // Add the batch norm module to the sequence
                    new_seq->push_back(bn_module);
                }

                // Add the ReLU module with inplace=true to match Python
                auto relu_options = torch::nn::ReLUOptions().inplace(true);
                new_seq->push_back(torch::nn::ReLU(relu_options));

                // Replace the old sequence with the new one
                seq = new_seq;

                std::cout << "Loaded weights for " << prefix << std::endl;
            } else {
                std::cerr << "Weight files not found for " << prefix << std::endl;
            }
        } catch (const std::exception& e) {
            std::cerr << "Error loading weights for " << prefix << ": " << e.what() << std::endl;
            throw; // re-throw to stop execution
        }
    };

    // Helper lambda to load weights for the linear blocks
    auto load_linear_block_weights = [this](LinearBlock& block, const std::string& prefix) {
        try {
            // Load weights for the linear layer
            std::string weight_path = model_dir + "/" + prefix + "_linear_weight.pt";
            std::string bias_path = model_dir + "/" + prefix + "_linear_bias.pt";

            if (fs::exists(weight_path) && fs::exists(bias_path)) {
                auto linear_weight = load_tensor(weight_path);
                auto linear_bias = load_tensor(bias_path);

                // Set weights and bias
                block.linear->weight = linear_weight;
                block.linear->bias = linear_bias;

                // Load batch norm parameters
                std::string bn_weight_path = model_dir + "/" + prefix + "_bn_weight.pt";
                std::string bn_bias_path = model_dir + "/" + prefix + "_bn_bias.pt";
                std::string bn_mean_path = model_dir + "/" + prefix + "_bn_running_mean.pt";
                std::string bn_var_path = model_dir + "/" + prefix + "_bn_running_var.pt";

                if (fs::exists(bn_weight_path) && fs::exists(bn_bias_path) &&
                    fs::exists(bn_mean_path) && fs::exists(bn_var_path)) {
                    auto bn_weight = load_tensor(bn_weight_path);
                    auto bn_bias = load_tensor(bn_bias_path);
                    auto bn_mean = load_tensor(bn_mean_path);
                    auto bn_var = load_tensor(bn_var_path);

                    // Set batch norm parameters
                    block.bn->weight = bn_weight;
                    block.bn->bias = bn_bias;
                    block.bn->running_mean = bn_mean;
                    block.bn->running_var = bn_var;
                }

                std::cout << "Loaded weights for " << prefix << std::endl;
            } else {
                std::cerr << "Weight files not found for " << prefix << std::endl;
            }
        } catch (const std::exception& e) {
            std::cerr << "Error loading weights for " << prefix << ": " << e.what() << std::endl;
            throw; // re-throw to stop execution
        }
    };
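
    // Expected weight-file naming (as constructed by the two loaders above),
    // e.g. for the conv3_1r block:
    //   conv3_1r_0_weight.pt, conv3_1r_0_bias.pt                 (conv layer, index 0)
    //   conv3_1r_1_weight.pt, conv3_1r_1_bias.pt,
    //   conv3_1r_1_running_mean.pt, conv3_1r_1_running_var.pt    (batch norm, index 1)
    // and for linear blocks, e.g. fc3_rt_linear_weight.pt, fc3_rt_bn_running_mean.pt, etc.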

    // Load weights for all layers
    load_sequential_weights(conv3_1r, "conv3_1r");
    load_sequential_weights(conv3_1t, "conv3_1t");
    load_sequential_weights(conv3_2t, "conv3_2t");
    load_sequential_weights(fc3_1r, "fc3_1r");
    load_sequential_weights(conv4_1r, "conv4_1r");
    load_sequential_weights(conv4_1t, "conv4_1t");
    load_sequential_weights(conv4_2t, "conv4_2t");
    load_sequential_weights(fc34_3r, "fc34_3r");
    load_sequential_weights(fc34_4r, "fc34_4r");

    load_linear_block_weights(fc3_rt, "fc3_rt");
    load_linear_block_weights(fc4_rt, "fc4_rt");

    // Load IoU predictor weights
    try {
        std::string weight_path = model_dir + "/iou_predictor_weight.pt";
        std::string bias_path = model_dir + "/iou_predictor_bias.pt";

        if (fs::exists(weight_path) && fs::exists(bias_path)) {
            auto weight = load_tensor(weight_path);
            auto bias = load_tensor(bias_path);

            iou_predictor->weight = weight;
            iou_predictor->bias = bias;

            std::cout << "Loaded weights for iou_predictor" << std::endl;
        } else {
            std::cerr << "Weight files not found for iou_predictor" << std::endl;
        }
    } catch (const std::exception& e) {
        std::cerr << "Error loading weights for iou_predictor: " << e.what() << std::endl;
        throw; // re-throw to stop execution
    }
}

// Move model to device
void BBRegressor::to(torch::Device device) {
    // Verify the device is a CUDA device
    if (!device.is_cuda()) {
        throw std::runtime_error("BBRegressor requires a CUDA device");
    }

    this->device = device;

    // Move all components to the device
    conv3_1r->to(device);
    conv3_1t->to(device);
    conv3_2t->to(device);
    fc3_1r->to(device);
    conv4_1r->to(device);
    conv4_1t->to(device);
    conv4_2t->to(device);
    fc34_3r->to(device);  // previously missing: these two blocks were left behind
    fc34_4r->to(device);
    fc3_rt.to(device);
    fc4_rt.to(device);

    iou_predictor->to(device);
}

// Get IoU features from backbone features
std::vector<torch::Tensor> BBRegressor::get_iou_feat(std::vector<torch::Tensor> feat_in) {
    torch::NoGradGuard no_grad;

    if (feat_in.size() != 2) {
        throw std::runtime_error("get_iou_feat expects 2 input features (layer2, layer3).");
    }

    // feat_in[0] is backbone layer2 (e.g., [B, 512, H1, W1])
    // feat_in[1] is backbone layer3 (e.g., [B, 1024, H2, W2])
    auto feat3_t_in = feat_in[0].to(device);
    auto feat4_t_in = feat_in[1].to(device);

    // Process through the conv layers:
    // conv3_1t takes 512 -> 256 channels,
    // conv3_2t takes 256 -> 256 channels (pred_input_dim[0])
    auto c3_t = conv3_2t->forward(conv3_1t->forward(feat3_t_in));

    // conv4_1t takes 1024 -> 256 channels,
    // conv4_2t takes 256 -> 256 channels (pred_input_dim[1])
    auto c4_t = conv4_2t->forward(conv4_1t->forward(feat4_t_in));

    return {c3_t.contiguous(), c4_t.contiguous()};
}
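
// Shape flow (following the channel comments above): layer2 [B, 512, H1, W1]
// -> conv3_1t -> conv3_2t -> c3_t [B, 256, H1, W1], and layer3 [B, 1024, H2, W2]
// -> conv4_1t -> conv4_2t -> c4_t [B, 256, H2, W2].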

// Get modulation vectors for the target
std::vector<torch::Tensor> BBRegressor::get_modulation(std::vector<torch::Tensor> feat_in, torch::Tensor bb_in) {
    torch::NoGradGuard no_grad;

    auto feat3_r_in = feat_in[0].to(device); // backbone layer2 features, e.g. [1, 512, H1, W1]
    auto feat4_r_in = feat_in[1].to(device); // backbone layer3 features, e.g. [1, 1024, H2, W2]
    auto bb = bb_in.to(device);              // target bounding box, e.g. [1, 1, 4] (x, y, w, h)

    // Ensure bb is [batch_size, 1, 4], then reshape to [batch_size, 4] for PrRoIPooling
    // (PrRoIPooling expects rows of [batch_idx, x1, y1, x2, y2])
    if (bb.dim() == 3 && bb.size(1) == 1) {
        bb = bb.squeeze(1); // now [batch_size, 4]
    } else if (bb.dim() != 2 || bb.size(1) != 4) {
        throw std::runtime_error("get_modulation: bb must be [batch, 1, 4] or [batch, 4]");
    }

    // Python: c3_r = self.conv3_1r(feat3_r)
    auto c3_r = conv3_1r->forward(feat3_r_in).contiguous(); // output: [B, 128, H1, W1]

    // Python: roi1 built from bb as (batch_idx, x1, y1, x2, y2)
    auto batch_size = bb.size(0);
    auto roi1 = torch::zeros({batch_size, 5}, bb.options());
    for (int64_t i = 0; i < batch_size; ++i) {
        roi1.index_put_({i, 0}, static_cast<float>(i));
    }
    roi1.index_put_({torch::indexing::Slice(), 1}, bb.index({torch::indexing::Slice(), 0})); // x1
    roi1.index_put_({torch::indexing::Slice(), 2}, bb.index({torch::indexing::Slice(), 1})); // y1
    roi1.index_put_({torch::indexing::Slice(), 3}, bb.index({torch::indexing::Slice(), 0}) + bb.index({torch::indexing::Slice(), 2})); // x2 = x1 + w
    roi1.index_put_({torch::indexing::Slice(), 4}, bb.index({torch::indexing::Slice(), 1}) + bb.index({torch::indexing::Slice(), 3})); // y2 = y1 + h

    // Python: roi3r = self.prroi_pool3r(c3_r, roi1), where prroi_pool3r is (3, 3, 1/8)
    auto roi3r = prroi_pool3r->forward(c3_r, roi1).contiguous(); // output: [B, 128, 3, 3]

    // Python: c4_r = self.conv4_1r(feat4_r)
    auto c4_r = conv4_1r->forward(feat4_r_in).contiguous(); // output: [B, 256, H2, W2]
    // Python: roi4r = self.prroi_pool4r(c4_r, roi1), where prroi_pool4r is (1, 1, 1/16)
    auto roi4r = prroi_pool4r->forward(c4_r, roi1).contiguous(); // output: [B, 256, 1, 1]

    // Python: fc3_r = self.fc3_1r(roi3r)
    // fc3_1r is conv(128, 256, kernel_size=3, stride=1, padding=0)
    auto fc3_r = fc3_1r->forward(roi3r).contiguous(); // output: [B, 256, 1, 1] (3x3 kernel, padding 0, on a 3x3 input)

    // Python: fc34_r = torch.cat((fc3_r, roi4r), dim=1)
    auto fc34_r = torch::cat({fc3_r, roi4r}, 1).contiguous(); // output: [B, 256+256=512, 1, 1]

    // Python: fc34_3_r = self.fc34_3r(fc34_r), conv(512, 256, kernel_size=1, stride=1, padding=0)
    auto fc34_3_r_out = fc34_3r->forward(fc34_r).contiguous(); // output: [B, 256, 1, 1]

    // Python: fc34_4_r = self.fc34_4r(fc34_r), conv(512, 256, kernel_size=1, stride=1, padding=0)
    auto fc34_4_r_out = fc34_4r->forward(fc34_r).contiguous(); // output: [B, 256, 1, 1]

    std::cout << "  get_modulation output shapes: " << std::endl;
    std::cout << "   fc34_3_r_out: " << fc34_3_r_out.sizes() << std::endl;
    std::cout << "   fc34_4_r_out: " << fc34_4_r_out.sizes() << std::endl;

    return {fc34_3_r_out, fc34_4_r_out};
}
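
// Worked example of the box conversion above: with batch index 0 and
// bb = [x, y, w, h] = [10, 20, 30, 40], roi1 becomes
// [0, 10, 20, 10 + 30, 20 + 40] = [0, 10, 20, 40, 60].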

// Predict IoU for proposals
torch::Tensor BBRegressor::predict_iou(std::vector<torch::Tensor> modulation,
                                       std::vector<torch::Tensor> feat,
                                       torch::Tensor proposals) {
    // Debug dimensions
    std::cout << "Input dimensions:" << std::endl;
    std::cout << "  modulation[0]: [" << modulation[0].size(0) << ", " << modulation[0].size(1) << "]" << std::endl;
    std::cout << "  modulation[1]: [" << modulation[1].size(0) << ", " << modulation[1].size(1) << "]" << std::endl;
    std::cout << "  feat[0]: [" << feat[0].size(0) << ", " << feat[0].size(1) << ", "
              << feat[0].size(2) << ", " << feat[0].size(3) << "]" << std::endl;
    std::cout << "  feat[1]: [" << feat[1].size(0) << ", " << feat[1].size(1) << ", "
              << feat[1].size(2) << ", " << feat[1].size(3) << "]" << std::endl;
    std::cout << "  proposals: [" << proposals.size(0) << ", " << proposals.size(1) << ", " << proposals.size(2) << "]" << std::endl;

    // Convert proposals from [batch, num_proposals, 4] to [num_proposals, 5] format,
    // with the batch index as the first element
    auto batch_size = proposals.size(0);
    auto num_proposals = proposals.size(1);

    // Reshape proposals to [num_proposals, 4]
    auto proposals_view = proposals.reshape({-1, 4});

    // Create a batch-index tensor [0, 0, 0, ...] for all proposals
    // (note: this assumes batch_size == 1, since every index is zero)
    auto batch_indices = torch::zeros({num_proposals, 1}, proposals.options());

    // Convert proposals from [x, y, w, h] to [batch_idx, x1, y1, x2, y2] format
    auto roi = torch::zeros({num_proposals, 5}, proposals.options());
    roi.index_put_({torch::indexing::Slice(), 0}, batch_indices.squeeze());
    roi.index_put_({torch::indexing::Slice(), 1}, proposals_view.index({torch::indexing::Slice(), 0}));
    roi.index_put_({torch::indexing::Slice(), 2}, proposals_view.index({torch::indexing::Slice(), 1}));

    // Calculate x2, y2 from width and height
    auto x2 = proposals_view.index({torch::indexing::Slice(), 0}) + proposals_view.index({torch::indexing::Slice(), 2});
    auto y2 = proposals_view.index({torch::indexing::Slice(), 1}) + proposals_view.index({torch::indexing::Slice(), 3});
    roi.index_put_({torch::indexing::Slice(), 3}, x2);
    roi.index_put_({torch::indexing::Slice(), 4}, y2);

    // Make sure the ROI is on the same device as the features
    torch::Device feat_device = feat[0].device();
    roi = roi.to(feat_device);

    // Apply ROI pooling to get features for each proposal
    // (using the test-branch pools prroi_pool3t and prroi_pool4t,
    //  not the reference-branch prroi_pool3r / prroi_pool4r)
    auto pooled_feat1 = prroi_pool3t->forward(feat[0], roi);
    auto pooled_feat2 = prroi_pool4t->forward(feat[1], roi);

    // Make sure all tensors are on the same device (GPU)
    torch::Device target_device = modulation[0].device();
    pooled_feat1 = pooled_feat1.to(target_device);
    pooled_feat2 = pooled_feat2.to(target_device);

    // Print intermediate tensor shapes
    std::cout << "  Pooled shapes:" << std::endl;
    std::cout << "   pooled_feat1: [" << pooled_feat1.size(0) << ", " << pooled_feat1.size(1) << ", "
              << pooled_feat1.size(2) << ", " << pooled_feat1.size(3) << "]" << std::endl;
    std::cout << "   pooled_feat2: [" << pooled_feat2.size(0) << ", " << pooled_feat2.size(1) << ", "
              << pooled_feat2.size(2) << ", " << pooled_feat2.size(3) << "]" << std::endl;

    // Inspect the IoU predictor dimensions
    std::cout << "  IoU predictor dimensions:" << std::endl;
    std::cout << "   weight: [" << iou_predictor->weight.size(0) << ", " << iou_predictor->weight.size(1) << "]" << std::endl;
    std::cout << "   bias: [" << iou_predictor->bias.size(0) << "]" << std::endl;

    try {
        // Process the pooled features through fc3_rt and fc4_rt (LinearBlocks);
        // these handle the reshape and the linear transformation:
        //   pooled_feat1 [B*N, 256, 5, 5] -> fc3_rt -> [B*N, 256]
        //   pooled_feat2 [B*N, 256, 3, 3] -> fc4_rt -> [B*N, 256]
        std::cout << "  Applying fc3_rt to pooled_feat1 (shape: " << pooled_feat1.sizes() << ")" << std::endl;
        auto mod_target_0 = fc3_rt.forward(pooled_feat1);
        std::cout << "  Applying fc4_rt to pooled_feat2 (shape: " << pooled_feat2.sizes() << ")" << std::endl;
        auto mod_target_1 = fc4_rt.forward(pooled_feat2);

        std::cout << "  mod_target_0 shape: " << mod_target_0.sizes() << std::endl;
        std::cout << "  mod_target_1 shape: " << mod_target_1.sizes() << std::endl;

        // The IoU predictor expects 512 input features (its weight matrix is
        // [1, 512]): 256 from each branch, matching the original Python
        // implementation.

        // Get the modulation shapes
        std::cout << "  Modulation vector shapes (from get_modulation):" << std::endl;
        std::cout << "   mod1 (input arg): [" << modulation[0].size(0) << ", " << modulation[0].size(1);
        if (modulation[0].dim() > 2) std::cout << ", " << modulation[0].size(2) << ", " << modulation[0].size(3);
        std::cout << "]" << std::endl;
        std::cout << "   mod2 (input arg): [" << modulation[1].size(0) << ", " << modulation[1].size(1);
        if (modulation[1].dim() > 2) std::cout << ", " << modulation[1].size(2) << ", " << modulation[1].size(3);
        std::cout << "]" << std::endl;

        // (A large block of commented-out code that manually average-pooled the
        // flattened features down to 256 dims was removed here: the LinearBlocks
        // above already produce [num_proposals, 256], so no reduction is needed.)
        // Prepare the modulation vectors for each proposal
        auto m0_in = modulation[0]; // shape can be [1, 256] or [1, 256, 1, 1]
        auto m1_in = modulation[1];

        if (m0_in.dim() == 4 && m0_in.size(2) == 1 && m0_in.size(3) == 1) {
            m0_in = m0_in.squeeze(-1).squeeze(-1); // now [1, 256]
        }
        if (m1_in.dim() == 4 && m1_in.size(2) == 1 && m1_in.size(3) == 1) {
            m1_in = m1_in.squeeze(-1).squeeze(-1); // now [1, 256]
        }

        // m0_in and m1_in are now guaranteed to be 2D [batch, channels], e.g. [1, 256]
        auto mod1_repeated_for_proposals = m0_in.repeat({num_proposals, 1}); // [num_proposals, 256]
        auto mod2_repeated_for_proposals = m1_in.repeat({num_proposals, 1}); // [num_proposals, 256]

        std::cout << "  Final feature shapes (after LinearBlocks, before element-wise mult):" << std::endl;
        std::cout << "   mod_target_0 (from fc3_rt): [" << mod_target_0.size(0) << ", " << mod_target_0.size(1) << "]" << std::endl;
        std::cout << "   mod_target_1 (from fc4_rt): [" << mod_target_1.size(0) << ", " << mod_target_1.size(1) << "]" << std::endl;
        std::cout << "   mod1_repeated (from get_modulation input): [" << mod1_repeated_for_proposals.size(0) << ", " << mod1_repeated_for_proposals.size(1) << "]" << std::endl;
        std::cout << "   mod2_repeated (from get_modulation input): [" << mod2_repeated_for_proposals.size(0) << ", " << mod2_repeated_for_proposals.size(1) << "]" << std::endl;

        // Element-wise multiply the pooled features with the modulation vectors
        // (using mod_target_0 / mod_target_1 from fc3_rt / fc4_rt)
        auto mod_feat1 = mod_target_0 * mod1_repeated_for_proposals;
        auto mod_feat2 = mod_target_1 * mod2_repeated_for_proposals;

        // Concatenate to get the final features for IoU prediction
        auto ioufeat = torch::cat({mod_feat1, mod_feat2}, /*dim=*/1);
        std::cout << "  ioufeat shape: [" << ioufeat.size(0) << ", " << ioufeat.size(1) << "]" << std::endl;
        // Try the GPU implementation first
        torch::Tensor iou_scores;
        try {
            // Apply the IoU predictor on the GPU
            std::cout << "  Applying IoU predictor on GPU" << std::endl;
            iou_scores = iou_predictor->forward(ioufeat);
        } catch (const std::exception& cuda_error) {
            // If the GPU implementation fails, fall back to a CPU implementation
            std::cout << "  GPU implementation failed: " << cuda_error.what() << std::endl;
            std::cout << "  Falling back to CPU implementation" << std::endl;

            // Move tensors to the CPU
            auto ioufeat_cpu = ioufeat.to(torch::kCPU);
            auto weight_cpu = iou_predictor->weight.to(torch::kCPU);
            auto bias_cpu = iou_predictor->bias.to(torch::kCPU);

            // Implement the linear layer manually:
            // for each proposal, score = bias + dot(ioufeat_row, weight_row)
            auto scores_cpu = torch::zeros({num_proposals, 1}, torch::kCPU);

            for (int i = 0; i < num_proposals; i++) {
                // Start with the bias
                float score = bias_cpu[0].item<float>();

                // Add the weighted sum of features
                for (int j = 0; j < ioufeat_cpu.size(1); j++) {
                    score += ioufeat_cpu[i][j].item<float>() * weight_cpu[0][j].item<float>();
                }

                scores_cpu[i][0] = score;
            }

            // Move the results back to the original device
            iou_scores = scores_cpu.to(target_device);
        }

        std::cout << "  iou_scores raw shape: [" << iou_scores.size(0) << ", " << iou_scores.size(1) << "]" << std::endl;

        // Reshape back to [batch_size, num_proposals]
        iou_scores = iou_scores.reshape({batch_size, num_proposals});
        std::cout << "  Final iou_scores shape: [" << iou_scores.size(0) << ", " << iou_scores.size(1) << "]" << std::endl;

        return iou_scores;
    } catch (const std::exception& e) {
        // This path is not expected to be reached; it exists as a last-resort fallback.
        std::cerr << "CRITICAL: Unexpected error in predict_iou: " << e.what() << std::endl;

        // Fall back to a direct geometric box-overlap calculation that does not
        // rely on the network (and uses no "magic numbers")
        std::cout << "  Implementing direct IoU calculation using box overlaps" << std::endl;

        // Move tensors to the CPU for the direct calculation
        auto proposals_cpu = proposals.to(torch::kCPU);
        // Caution: this treats the first four channels of modulation[0] as the
        // original target box [x, y, w, h]; that only holds if the caller stores
        // the box there, which the normal get_modulation output does not.
        auto bb_cpu = modulation[0].to(torch::kCPU);

        // Create the output tensor on the CPU
        auto iou_scores = torch::zeros({batch_size, num_proposals}, torch::kCPU);

        // Calculate IoU geometrically for each proposal
        for (int i = 0; i < num_proposals; i++) {
            float target_x1 = proposals_view[i][0].item<float>();
            float target_y1 = proposals_view[i][1].item<float>();
            float target_x2 = target_x1 + proposals_view[i][2].item<float>();
            float target_y2 = target_y1 + proposals_view[i][3].item<float>();

            float box_x1 = bb_cpu[0][0].item<float>();
            float box_y1 = bb_cpu[0][1].item<float>();
            float box_x2 = box_x1 + bb_cpu[0][2].item<float>();
            float box_y2 = box_y1 + bb_cpu[0][3].item<float>();

            // Calculate the intersection area
            float x_left = std::max(target_x1, box_x1);
            float y_top = std::max(target_y1, box_y1);
            float x_right = std::min(target_x2, box_x2);
            float y_bottom = std::min(target_y2, box_y2);

            float intersection_area = std::max(0.0f, x_right - x_left) * std::max(0.0f, y_bottom - y_top);

            // Calculate the union area
            float target_area = (target_x2 - target_x1) * (target_y2 - target_y1);
            float box_area = (box_x2 - box_x1) * (box_y2 - box_y1);
            float union_area = target_area + box_area - intersection_area;

            // IoU = intersection / union
            float iou = union_area > 0 ? intersection_area / union_area : 0;
            iou_scores[0][i] = iou;
        }

        // Move back to the original device
        return iou_scores.to(target_device);
    }
}
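
// Typical call sequence (a sketch based on the methods above):
//   auto iou_feat   = regressor.get_iou_feat({layer2, layer3});             // test-branch features
//   auto modulation = regressor.get_modulation({layer2_ref, layer3_ref}, target_bb);
//   auto scores     = regressor.predict_iou(modulation, iou_feat, proposals); // [batch, num_proposals]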

// Print model information
void BBRegressor::print_model_info() {
    std::cout << "BBRegressor Model Information:" << std::endl;
    std::cout << " - Model directory: " << model_dir << std::endl;
    std::cout << " - Device: CUDA:" << device.index() << std::endl;
    std::cout << " - CUDA Device Count: " << torch::cuda::device_count() << std::endl;
    std::cout << " - Using PreciseRoIPooling: " <<
#ifdef WITH_PRROI_POOLING
        "Yes"
#else
        "No (will fail)"
#endif
        << std::endl;
}

// Compute statistics for a tensor
BBRegressor::TensorStats BBRegressor::compute_stats(const torch::Tensor& tensor) {
    TensorStats stats;

    // Record the shape
    for (int i = 0; i < tensor.dim(); i++) {
        stats.shape.push_back(tensor.size(i));
    }

    // Compute basic stats - reduce to scalar values over all elements
    stats.mean = tensor.mean().item<float>();
    stats.std_dev = tensor.std().item<float>();
    stats.min_val = tensor.min().item<float>();
    stats.max_val = tensor.max().item<float>();
    stats.sum = tensor.sum().item<float>();

    // Sample values at specific positions
    if (tensor.dim() >= 4) {
        // For 4D tensors (batch, channel, height, width)
        stats.samples.push_back(tensor.index({0, 0, 0, 0}).item<float>());

        if (tensor.size(1) > 1 && tensor.size(2) > 1 && tensor.size(3) > 1) {
            int mid_c = static_cast<int>(tensor.size(1) / 2);
            int mid_h = static_cast<int>(tensor.size(2) / 2);
            int mid_w = static_cast<int>(tensor.size(3) / 2);
            stats.samples.push_back(tensor.index({0, mid_c, mid_h, mid_w}).item<float>());

            // static_cast converts int64_t to int to avoid a type mismatch
            int64_t last_c_idx = tensor.size(1) - 1;
            int64_t last_h_idx = tensor.size(2) - 1;
            int64_t last_w_idx = tensor.size(3) - 1;

            // Cap the indices at 10 so the sampled positions stay small and consistent
            if (last_c_idx > 10) last_c_idx = 10;
            if (last_h_idx > 10) last_h_idx = 10;
            if (last_w_idx > 10) last_w_idx = 10;

            stats.samples.push_back(tensor.index({0, static_cast<int>(last_c_idx),
                                                  static_cast<int>(last_h_idx),
                                                  static_cast<int>(last_w_idx)}).item<float>());
        }
    } else if (tensor.dim() == 3) {
        // For 3D tensors
        stats.samples.push_back(tensor.index({0, 0, 0}).item<float>());

        if (tensor.size(1) > 1 && tensor.size(2) > 1) {
            int mid_h = static_cast<int>(tensor.size(1) / 2);
            int mid_w = static_cast<int>(tensor.size(2) / 2);
            stats.samples.push_back(tensor.index({0, mid_h, mid_w}).item<float>());

            int last_h = static_cast<int>(tensor.size(1) - 1);
            int last_w = static_cast<int>(tensor.size(2) - 1);
            stats.samples.push_back(tensor.index({0, last_h, last_w}).item<float>());
        }
    } else if (tensor.dim() == 2) {
        // For 2D tensors
        stats.samples.push_back(tensor.index({0, 0}).item<float>());

        if (tensor.size(0) > 1 && tensor.size(1) > 1) {
            int mid_h = static_cast<int>(tensor.size(0) / 2);
            int mid_w = static_cast<int>(tensor.size(1) / 2);
            stats.samples.push_back(tensor.index({mid_h, mid_w}).item<float>());

            int last_h = static_cast<int>(tensor.size(0) - 1);
            int last_w = static_cast<int>(tensor.size(1) - 1);
            stats.samples.push_back(tensor.index({last_h, last_w}).item<float>());
        }
    } else {
        // For 1D tensors or scalars
        if (tensor.numel() > 0) {
            stats.samples.push_back(tensor.index({0}).item<float>());

            if (tensor.size(0) > 1) {
                int mid = static_cast<int>(tensor.size(0) / 2);
                stats.samples.push_back(tensor.index({mid}).item<float>());

                int last = static_cast<int>(tensor.size(0) - 1);
                stats.samples.push_back(tensor.index({last}).item<float>());
            }
        }
    }

    return stats;
}

// Save tensor statistics to a file
void BBRegressor::save_stats(const std::vector<TensorStats>& all_stats, const std::string& filepath) {
    std::ofstream file(filepath);
    if (!file.is_open()) {
        std::cerr << "Error opening file for writing: " << filepath << std::endl;
        return;
    }

    for (size_t i = 0; i < all_stats.size(); i++) {
        const auto& stats = all_stats[i];
        file << "Output " << i << ":" << std::endl;

        file << "  Shape: [";
        for (size_t j = 0; j < stats.shape.size(); j++) {
            file << stats.shape[j];
            if (j < stats.shape.size() - 1) file << ", ";
        }
        file << "]" << std::endl;

        file << "  Mean: " << stats.mean << std::endl;
        file << "  Std: " << stats.std_dev << std::endl;
        file << "  Min: " << stats.min_val << std::endl;
        file << "  Max: " << stats.max_val << std::endl;
        file << "  Sum: " << stats.sum << std::endl;

        file << "  Sample values: [";
        for (size_t j = 0; j < stats.samples.size(); j++) {
            file << stats.samples[j];
            if (j < stats.samples.size() - 1) file << ", ";
        }
        file << "]" << std::endl << std::endl;
    }

    file.close();
}