#include "bb_regressor.h"
#include <iostream>
#include <fstream>
#include <torch/script.h>
#include <torch/serialize.h>
#include <vector>
#include <stdexcept>
// CUDA includes and the PrRoIPooling declarations are only needed when not building in CPU_ONLY mode
#ifndef CPU_ONLY
#include <cuda_runtime.h>
#include <ATen/cuda/CUDAContext.h>
// Use the new PrRoIPooling implementation
#include "prroi_pooling_gpu.h"
#include "prroi_pooling_gpu_impl.cuh"
#endif
// PrRoIPool2D implementation with CPU fallback
PrRoIPool2D::PrRoIPool2D(int pooled_height, int pooled_width, float spatial_scale)
: pooled_height_(pooled_height), pooled_width_(pooled_width), spatial_scale_(spatial_scale) {}
torch::Tensor PrRoIPool2D::forward(torch::Tensor feat, torch::Tensor rois) {
// Print shape info for debugging
std::cout << " PrRoIPool2D inputs: " << std::endl;
std::cout << " Features: [" << feat.size(0) << ", " << feat.size(1) << ", "
<< feat.size(2) << ", " << feat.size(3) << "]" << std::endl;
std::cout << " ROIs: [" << rois.size(0) << ", " << rois.size(1) << "]" << std::endl;
std::cout << " Pooled size: [" << pooled_height_ << ", " << pooled_width_ << "]" << std::endl;
std::cout << " Spatial scale: " << spatial_scale_ << std::endl;
// Calculate output shape
int channels = feat.size(1);
int num_rois = rois.size(0);
// Create output tensor
auto output = torch::zeros({num_rois, channels, pooled_height_, pooled_width_},
feat.options());
// Use a simple average pooling as fallback
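// Note: this CPU fallback approximates PrRoI pooling with a plain integer-grid
// average over each bin. The original PrRoI pooling integrates a bilinearly
// interpolated feature map over continuous bin boundaries, so small numerical
// differences versus the CUDA/Python implementation are expected here.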
for (int n = 0; n < num_rois; n++) {
// Get ROI coordinates (batch_idx, x1, y1, x2, y2)
int roi_batch_idx = static_cast<int>(rois[n][0].item<float>());
float roi_x1 = rois[n][1].item<float>() * spatial_scale_;
float roi_y1 = rois[n][2].item<float>() * spatial_scale_;
float roi_x2 = rois[n][3].item<float>() * spatial_scale_;
float roi_y2 = rois[n][4].item<float>() * spatial_scale_;
// Skip invalid ROIs
if (roi_batch_idx < 0) continue;
// Force ROI bounds within image
int img_height = feat.size(2);
int img_width = feat.size(3);
roi_x1 = std::max(0.0f, std::min(static_cast<float>(img_width - 1), roi_x1));
roi_y1 = std::max(0.0f, std::min(static_cast<float>(img_height - 1), roi_y1));
roi_x2 = std::max(0.0f, std::min(static_cast<float>(img_width - 1), roi_x2));
roi_y2 = std::max(0.0f, std::min(static_cast<float>(img_height - 1), roi_y2));
// Convert to integers for pooling
int x1 = static_cast<int>(roi_x1);
int y1 = static_cast<int>(roi_y1);
int x2 = static_cast<int>(std::ceil(roi_x2));
int y2 = static_cast<int>(std::ceil(roi_y2));
// Calculate bin sizes
float bin_width = (roi_x2 - roi_x1) / pooled_width_;
float bin_height = (roi_y2 - roi_y1) / pooled_height_;
// Perform pooling for each output location
for (int ph = 0; ph < pooled_height_; ph++) {
for (int pw = 0; pw < pooled_width_; pw++) {
// Compute bin boundaries
int hstart = static_cast<int>(roi_y1 + ph * bin_height);
int wstart = static_cast<int>(roi_x1 + pw * bin_width);
int hend = static_cast<int>(std::ceil(roi_y1 + (ph + 1) * bin_height));
int wend = static_cast<int>(std::ceil(roi_x1 + (pw + 1) * bin_width));
// Clip to image boundaries
hstart = std::max(0, std::min(img_height - 1, hstart));
wstart = std::max(0, std::min(img_width - 1, wstart));
hend = std::max(0, std::min(img_height, hend));
wend = std::max(0, std::min(img_width, wend));
// Skip empty bins
if (hend <= hstart || wend <= wstart) continue;
// Calculate pool size
int pool_size = (hend - hstart) * (wend - wstart);
// For each channel, perform pooling
for (int c = 0; c < channels; c++) {
float sum = 0.0f;
// Sum over the bin area
for (int h = hstart; h < hend; h++) {
for (int w = wstart; w < wend; w++) {
sum += feat[roi_batch_idx][c][h][w].item<float>();
}
}
// Average pooling
if (pool_size > 0) {
output[n][c][ph][pw] = sum / pool_size;
}
}
}
}
}
return output;
}
// LinearBlock implementation
LinearBlock::LinearBlock(int in_planes, int out_planes, int input_sz, bool bias, bool batch_norm, bool relu) {
// Create the linear layer with proper input dimensions
auto linear_options = torch::nn::LinearOptions(in_planes * input_sz * input_sz, out_planes).bias(bias);
linear = register_module("linear", torch::nn::Linear(linear_options));
use_bn = batch_norm;
if (use_bn) {
// Important: use BatchNorm2d to match Python implementation
bn = register_module("bn", torch::nn::BatchNorm2d(torch::nn::BatchNorm2dOptions(out_planes)));
}
use_relu = relu;
if (use_relu) {
relu_ = register_module("relu", torch::nn::ReLU(torch::nn::ReLUOptions().inplace(true)));
}
}
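// Forward pass shape flow (mirrors the Python LinearBlock):
//   (N, C, H, W) -> flatten to (N, C*H*W) -> Linear -> (N, out_planes)
//   -> reshape to (N, out_planes, 1, 1) for BatchNorm2d -> ReLU -> flatten back to (N, out_planes)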
torch::Tensor LinearBlock::forward(torch::Tensor x) {
// Store original dtype for later
auto original_dtype = x.dtype();
// Use double precision for higher accuracy
auto x_double = x.to(torch::kFloat64);
// Reshape exactly as in Python: x.reshape(x.shape[0], -1)
x_double = x_double.reshape({x_double.size(0), -1}).contiguous();
// Convert back to original precision for the linear operation
auto x_float = x_double.to(original_dtype);
x_float = linear->forward(x_float);
// Back to double precision for further operations
x_double = x_float.to(torch::kFloat64);
if (use_bn) {
// This is crucial: reshape to 4D tensor for BatchNorm2d exactly as in Python
// In Python: x = self.bn(x.reshape(x.shape[0], x.shape[1], 1, 1))
x_double = x_double.reshape({x_double.size(0), x_double.size(1), 1, 1}).contiguous();
// Apply batch norm (convert to float32 for the operation)
x_float = x_double.to(original_dtype);
x_float = bn->forward(x_float);
x_double = x_float.to(torch::kFloat64);
}
// Apply ReLU if needed
if (use_relu) {
// Apply ReLU in float32 precision
x_float = x_double.to(original_dtype);
x_float = relu_->forward(x_float);
x_double = x_float.to(torch::kFloat64);
}
// Final reshape to 2D tensor, exactly matching Python's behavior
x_double = x_double.reshape({x_double.size(0), -1}).contiguous();
// Return tensor in original precision
return x_double.to(original_dtype);
}
// Create convolutional block
torch::nn::Sequential BBRegressor::create_conv_block(int in_planes, int out_planes,
int kernel_size, int stride,
int padding, int dilation) {
// Print dimensions for debugging
std::cout << "Creating conv block: in_planes=" << in_planes << ", out_planes=" << out_planes << std::endl;
torch::nn::Sequential seq;
// Add convolutional layer
seq->push_back(torch::nn::Conv2d(torch::nn::Conv2dOptions(in_planes, out_planes, kernel_size)
.stride(stride).padding(padding).dilation(dilation).bias(true)));
// Add batch normalization layer
seq->push_back(torch::nn::BatchNorm2d(torch::nn::BatchNorm2dOptions(out_planes)));
// Add ReLU activation
seq->push_back(torch::nn::ReLU(torch::nn::ReLUOptions().inplace(true)));
return seq;
}
// Helper function to verify BatchNorm dimensions
void BBRegressor::verify_batchnorm_dimensions() {
std::cout << "Verifying BatchNorm dimensions..." << std::endl;
// Get children of conv3_1r
std::cout << "conv3_1r has " << conv3_1r->size() << " modules" << std::endl;
if (conv3_1r->size() > 1) {
auto module = conv3_1r[1];
std::cout << "conv3_1r module[1] type: " << module->name() << std::endl;
}
// Get children of conv3_1t
std::cout << "conv3_1t has " << conv3_1t->size() << " modules" << std::endl;
if (conv3_1t->size() > 1) {
auto module = conv3_1t[1];
std::cout << "conv3_1t module[1] type: " << module->name() << std::endl;
}
// Get children of conv3_2t
std::cout << "conv3_2t has " << conv3_2t->size() << " modules" << std::endl;
if (conv3_2t->size() > 1) {
auto module = conv3_2t[1];
std::cout << "conv3_2t module[1] type: " << module->name() << std::endl;
}
}
// Helper function to read file to bytes
std::vector<char> BBRegressor::read_file_to_bytes(const std::string& file_path) {
std::ifstream file(file_path, std::ios::binary | std::ios::ate);
if (!file.is_open()) {
throw std::runtime_error("Could not open file: " + file_path);
}
std::streamsize size = file.tellg();
file.seekg(0, std::ios::beg);
std::vector<char> buffer(size);
if (!file.read(buffer.data(), size)) {
throw std::runtime_error("Could not read file: " + file_path);
}
return buffer;
}
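// Each weight file is assumed to hold a single tensor serialized from Python
// (e.g. via torch.save / pickle), which is why the raw bytes can be handed
// straight to torch::pickle_load below.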
// Load tensor from file
torch::Tensor BBRegressor::load_tensor(const std::string& file_path) {
try {
// Read file into bytes first
std::vector<char> data = read_file_to_bytes(file_path);
// Use pickle_load with byte data
torch::Tensor tensor = torch::pickle_load(data).toTensor();
// Always move tensor to the specified device
if (tensor.device() != device) {
tensor = tensor.to(device);
}
return tensor;
} catch (const std::exception& e) {
std::cerr << "Error loading tensor from " << file_path << ": " << e.what() << std::endl;
throw;
}
}
// Constructor
BBRegressor::BBRegressor(const std::string& base_dir, torch::Device dev)
: device(dev), model_dir(base_dir + "/exported_weights/bb_regressor"),
fc3_rt(256, 256, 5, true, true, true),
fc4_rt(256, 256, 3, true, true, true) {
// Check if base directory exists
if (!fs::exists(base_dir)) {
throw std::runtime_error("Base directory does not exist: " + base_dir);
}
// Check if model directory exists
if (!fs::exists(model_dir)) {
throw std::runtime_error("Model directory does not exist: " + model_dir);
}
// Initialize convolution blocks - match Python's AtomIoUNet implementation exactly
std::cout << "Initializing conv blocks..." << std::endl;
// In Python: self.conv3_1r = conv(input_dim[0], 128, kernel_size=3, stride=1)
conv3_1r = create_conv_block(512, 128, 3, 1, 1, 1);
// In Python: self.conv3_1t = conv(input_dim[0], 256, kernel_size=3, stride=1)
conv3_1t = create_conv_block(512, 256, 3, 1, 1, 1);
// In Python: self.conv3_2t = conv(256, pred_input_dim[0], kernel_size=3, stride=1)
conv3_2t = create_conv_block(256, 256, 3, 1, 1, 1);
// Update pooling sizes to match the Python model exactly
// In Python: self.prroi_pool3r = PrRoIPool2D(3, 3, 1/8)
prroi_pool3r = std::make_shared<PrRoIPool2D>(3, 3, 0.125); // 1/8 scale for layer2
// In Python: self.prroi_pool3t = PrRoIPool2D(5, 5, 1/8)
prroi_pool3t = std::make_shared<PrRoIPool2D>(5, 5, 0.125); // 1/8 scale for layer2
// Create sequential blocks
// In Python: self.fc3_1r = conv(128, 256, kernel_size=3, stride=1, padding=0)
fc3_1r = create_conv_block(128, 256, 3, 1, 0, 1); // padding=0 for this layer
// In Python: self.conv4_1r = conv(input_dim[1], 256, kernel_size=3, stride=1)
conv4_1r = create_conv_block(1024, 256, 3, 1, 1, 1);
// In Python: self.conv4_1t = conv(input_dim[1], 256, kernel_size=3, stride=1)
conv4_1t = create_conv_block(1024, 256, 3, 1, 1, 1);
// In Python: self.conv4_2t = conv(256, pred_input_dim[1], kernel_size=3, stride=1)
conv4_2t = create_conv_block(256, 256, 3, 1, 1, 1);
// In Python: self.prroi_pool4r = PrRoIPool2D(1, 1, 1/16)
prroi_pool4r = std::make_shared<PrRoIPool2D>(1, 1, 0.0625); // 1/16 scale for layer3
// In Python: self.prroi_pool4t = PrRoIPool2D(3, 3, 1/16)
prroi_pool4t = std::make_shared<PrRoIPool2D>(3, 3, 0.0625); // 1/16 scale for layer3
// In Python: self.fc34_3r = conv(256 + 256, pred_input_dim[0], kernel_size=1, stride=1, padding=0)
fc34_3r = create_conv_block(512, 256, 1, 1, 0, 1); // kernel_size=1, padding=0
// In Python: self.fc34_4r = conv(256 + 256, pred_input_dim[1], kernel_size=1, stride=1, padding=0)
fc34_4r = create_conv_block(512, 256, 1, 1, 0, 1); // kernel_size=1, padding=0
// Linear blocks - exactly match Python's implementation dimensions and parameters
// In Python: self.fc3_rt = LinearBlock(pred_input_dim[0], pred_inter_dim[0], 5)
fc3_rt = LinearBlock(256, 256, 5, true, true, true);
// In Python: self.fc4_rt = LinearBlock(pred_input_dim[1], pred_inter_dim[1], 3)
fc4_rt = LinearBlock(256, 256, 3, true, true, true);
// In Python: self.iou_predictor = nn.Linear(pred_inter_dim[0]+pred_inter_dim[1], 1, bias=True)
iou_predictor = torch::nn::Linear(torch::nn::LinearOptions(256 + 256, 1).bias(true));
// Load all weights
load_weights();
// Set the model to evaluation mode
this->eval();
// Debug information
std::cout << "BB Regressor initialized in evaluation mode" << std::endl;
}
// Set the model to evaluation mode
void BBRegressor::eval() {
// Set all sequential modules to eval mode
conv3_1r->eval();
conv3_1t->eval();
conv3_2t->eval();
fc3_1r->eval();
conv4_1r->eval();
conv4_1t->eval();
conv4_2t->eval();
fc34_3r->eval();
fc34_4r->eval();
// Linear blocks also need to be set to eval mode for BatchNorm layers
fc3_rt.eval();
fc4_rt.eval();
// eval() is a no-op for a plain Linear layer, but call it for consistency
iou_predictor->eval();
}
// Load weights
void BBRegressor::load_weights() {
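// Expected file naming inside model_dir (as used below):
//   <prefix>_0_weight.pt / <prefix>_0_bias.pt            -> conv layer of a conv block
//   <prefix>_1_weight.pt / _1_bias.pt / _1_running_*.pt  -> batch norm of a conv block
//   <prefix>_linear_*.pt and <prefix>_bn_*.pt            -> linear blocks (fc3_rt, fc4_rt)
//   iou_predictor_weight.pt / iou_predictor_bias.pt      -> final IoU predictor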
// Helper lambda to load weights for a sequential module
auto load_sequential_weights = [this](torch::nn::Sequential& seq, const std::string& prefix) {
try {
// Load weights for conv layer (index 0)
std::string weight_path = model_dir + "/" + prefix + "_0_weight.pt";
std::string bias_path = model_dir + "/" + prefix + "_0_bias.pt";
if (fs::exists(weight_path) && fs::exists(bias_path)) {
auto conv_weight = load_tensor(weight_path);
auto conv_bias = load_tensor(bias_path);
// Rebuild the Conv2d module using dimensions derived from the loaded weight tensor
int out_channels = conv_weight.size(0);
int in_channels = conv_weight.size(1);
int kernel_size = conv_weight.size(2);
std::cout << "Loading " << prefix << " conv weights: "
<< "[out_ch=" << out_channels
<< ", in_ch=" << in_channels
<< ", kernel=" << kernel_size << "]" << std::endl;
// Use the correct padding based on the layer name
int padding = 1; // Default padding
// Special cases for layers with different padding
if (prefix == "fc3_1r" || prefix == "fc34_3r" || prefix == "fc34_4r") {
padding = 0; // These layers use padding=0 in the Python implementation
}
std::cout << " Using padding=" << padding << " for " << prefix << std::endl;
auto conv_options = torch::nn::Conv2dOptions(in_channels, out_channels, kernel_size)
.stride(1).padding(padding).bias(true);
auto conv_module = torch::nn::Conv2d(conv_options);
// Set weights and bias
conv_module->weight = conv_weight;
conv_module->bias = conv_bias;
// Debug info - print some weight stats
std::cout << " Conv weight stats: mean=" << conv_weight.mean().item<float>()
<< ", std=" << conv_weight.std().item<float>()
<< ", min=" << conv_weight.min().item<float>()
<< ", max=" << conv_weight.max().item<float>() << std::endl;
// Create a new sequence with the proper conv module
auto new_seq = torch::nn::Sequential();
new_seq->push_back(conv_module);
// Load batch norm parameters (index 1)
std::string bn_weight_path = model_dir + "/" + prefix + "_1_weight.pt";
std::string bn_bias_path = model_dir + "/" + prefix + "_1_bias.pt";
std::string bn_mean_path = model_dir + "/" + prefix + "_1_running_mean.pt";
std::string bn_var_path = model_dir + "/" + prefix + "_1_running_var.pt";
if (fs::exists(bn_weight_path) && fs::exists(bn_bias_path) &&
fs::exists(bn_mean_path) && fs::exists(bn_var_path)) {
auto bn_weight = load_tensor(bn_weight_path);
auto bn_bias = load_tensor(bn_bias_path);
auto bn_mean = load_tensor(bn_mean_path);
auto bn_var = load_tensor(bn_var_path);
// Important: Create BatchNorm with the correct number of features from the weights
int num_features = bn_weight.size(0);
std::cout << " Creating BatchNorm2d with num_features=" << num_features << std::endl;
// Create a proper batch norm module with the right number of features
auto bn_options = torch::nn::BatchNorm2dOptions(num_features)
.eps(1e-5) // Match Python default
.momentum(0.1) // Match Python default
.affine(true)
.track_running_stats(true);
auto bn_module = torch::nn::BatchNorm2d(bn_options);
// Set batch norm parameters
bn_module->weight = bn_weight;
bn_module->bias = bn_bias;
bn_module->running_mean = bn_mean;
bn_module->running_var = bn_var;
// Debug info - print some batch norm stats
std::cout << " BN weight stats: mean=" << bn_weight.mean().item<float>()
<< ", std=" << bn_weight.std().item<float>() << std::endl;
std::cout << " BN running_mean stats: mean=" << bn_mean.mean().item<float>()
<< ", std=" << bn_mean.std().item<float>() << std::endl;
std::cout << " BN running_var stats: mean=" << bn_var.mean().item<float>()
<< ", std=" << bn_var.std().item<float>() << std::endl;
// Add the batch norm module to the sequence
new_seq->push_back(bn_module);
}
// Add the ReLU module with inplace=true to match Python
auto relu_options = torch::nn::ReLUOptions().inplace(true);
new_seq->push_back(torch::nn::ReLU(relu_options));
// Replace the old sequence with the new one
seq = new_seq;
std::cout << "Loaded weights for " << prefix << std::endl;
} else {
std::cerr << "Weight files not found for " << prefix << std::endl;
}
} catch (const std::exception& e) {
std::cerr << "Error loading weights for " << prefix << ": " << e.what() << std::endl;
throw; // Re-throw to stop execution
}
};
// Load weights for linear blocks
auto load_linear_block_weights = [this](LinearBlock& block, const std::string& prefix) {
try {
// Load weights for linear layer
std::string weight_path = model_dir + "/" + prefix + "_linear_weight.pt";
std::string bias_path = model_dir + "/" + prefix + "_linear_bias.pt";
if (fs::exists(weight_path) && fs::exists(bias_path)) {
auto linear_weight = load_tensor(weight_path);
auto linear_bias = load_tensor(bias_path);
// Set weights and bias
block.linear->weight = linear_weight;
block.linear->bias = linear_bias;
// Load batch norm parameters
std::string bn_weight_path = model_dir + "/" + prefix + "_bn_weight.pt";
std::string bn_bias_path = model_dir + "/" + prefix + "_bn_bias.pt";
std::string bn_mean_path = model_dir + "/" + prefix + "_bn_running_mean.pt";
std::string bn_var_path = model_dir + "/" + prefix + "_bn_running_var.pt";
if (fs::exists(bn_weight_path) && fs::exists(bn_bias_path) &&
fs::exists(bn_mean_path) && fs::exists(bn_var_path)) {
auto bn_weight = load_tensor(bn_weight_path);
auto bn_bias = load_tensor(bn_bias_path);
auto bn_mean = load_tensor(bn_mean_path);
auto bn_var = load_tensor(bn_var_path);
// Set batch norm parameters
block.bn->weight = bn_weight;
block.bn->bias = bn_bias;
block.bn->running_mean = bn_mean;
block.bn->running_var = bn_var;
}
std::cout << "Loaded weights for " << prefix << std::endl;
} else {
std::cerr << "Weight files not found for " << prefix << std::endl;
}
} catch (const std::exception& e) {
std::cerr << "Error loading weights for " << prefix << ": " << e.what() << std::endl;
throw; // Re-throw to stop execution
}
};
// Load weights for all layers
load_sequential_weights(conv3_1r, "conv3_1r");
load_sequential_weights(conv3_1t, "conv3_1t");
load_sequential_weights(conv3_2t, "conv3_2t");
load_sequential_weights(fc3_1r, "fc3_1r");
load_sequential_weights(conv4_1r, "conv4_1r");
load_sequential_weights(conv4_1t, "conv4_1t");
load_sequential_weights(conv4_2t, "conv4_2t");
load_sequential_weights(fc34_3r, "fc34_3r");
load_sequential_weights(fc34_4r, "fc34_4r");
load_linear_block_weights(fc3_rt, "fc3_rt");
load_linear_block_weights(fc4_rt, "fc4_rt");
// Load IoU predictor weights
try {
std::string weight_path = model_dir + "/iou_predictor_weight.pt";
std::string bias_path = model_dir + "/iou_predictor_bias.pt";
if (fs::exists(weight_path) && fs::exists(bias_path)) {
auto weight = load_tensor(weight_path);
auto bias = load_tensor(bias_path);
iou_predictor->weight = weight;
iou_predictor->bias = bias;
std::cout << "Loaded weights for iou_predictor" << std::endl;
} else {
std::cerr << "Weight files not found for iou_predictor" << std::endl;
}
} catch (const std::exception& e) {
std::cerr << "Error loading weights for iou_predictor: " << e.what() << std::endl;
throw; // Re-throw to stop execution
}
}
// Move model to device
void BBRegressor::to(torch::Device device) {
// Verify the device is a CUDA device
if (!device.is_cuda()) {
throw std::runtime_error("BBRegressor requires a CUDA device");
}
this->device = device;
// Move all components to device
conv3_1r->to(device);
conv3_1t->to(device);
conv3_2t->to(device);
fc3_1r->to(device);
conv4_1r->to(device);
conv4_1t->to(device);
conv4_2t->to(device);
fc34_3r->to(device);
fc34_4r->to(device);
fc3_rt.to(device);
fc4_rt.to(device);
iou_predictor->to(device);
}
// Get IoU features from backbone features
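// Input: test-frame backbone features [layer2 (512 ch, stride 8), layer3 (1024 ch, stride 16)].
// Output: two 256-channel feature maps produced by conv3_1t->conv3_2t and conv4_1t->conv4_2t,
// which are later modulated and pooled in predict_iou().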
std::vector<torch::Tensor> BBRegressor::get_iou_feat(std::vector<torch::Tensor> feat2) {
// Convert to double precision for better numerical stability
auto feat2_double0 = feat2[0].to(torch::kFloat64);
auto feat2_double1 = feat2[1].to(torch::kFloat64);
// Reshape exactly as in Python implementation
// In Python: feat2 = [f.reshape(-1, *f.shape[-3:]) if f.dim()==5 else f for f in feat2]
if (feat2_double0.dim() == 5) {
auto shape = feat2_double0.sizes();
feat2_double0 = feat2_double0.reshape({-1, shape[2], shape[3], shape[4]}).contiguous();
}
if (feat2_double1.dim() == 5) {
auto shape = feat2_double1.sizes();
feat2_double1 = feat2_double1.reshape({-1, shape[2], shape[3], shape[4]}).contiguous();
}
// Convert back to float32 for convolution operations
feat2[0] = feat2_double0.to(torch::kFloat32).contiguous();
feat2[1] = feat2_double1.to(torch::kFloat32).contiguous();
// Apply convolutions exactly as in Python
torch::Tensor feat3_t = feat2[0];
torch::Tensor feat4_t = feat2[1];
// Ensure we're in evaluation mode
torch::NoGradGuard no_grad;
// Apply convolutions just like Python version
torch::Tensor c3_t_1 = conv3_1t->forward(feat3_t);
c3_t_1 = c3_t_1.contiguous();
torch::Tensor c3_t = conv3_2t->forward(c3_t_1);
c3_t = c3_t.contiguous();
torch::Tensor c4_t_1 = conv4_1t->forward(feat4_t);
c4_t_1 = c4_t_1.contiguous();
torch::Tensor c4_t = conv4_2t->forward(c4_t_1);
c4_t = c4_t.contiguous();
// Return results
return {c3_t, c4_t};
}
// Get modulation vectors for the target
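// Input: reference-frame backbone features [layer2, layer3] and the target box bb in
// (x, y, w, h) format. Output: two modulation vectors (fc34_3_r, fc34_4_r) that are
// multiplied onto the test-frame features in predict_iou().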
std::vector<torch::Tensor> BBRegressor::get_modulation(std::vector<torch::Tensor> feat, torch::Tensor bb) {
// Convert to double precision for better numerical stability
auto feat0_double = feat[0].to(torch::kFloat64);
auto feat1_double = feat[1].to(torch::kFloat64);
auto bb_double = bb.to(torch::kFloat64);
// Handle 5D tensors exactly like Python implementation
if (feat0_double.dim() == 5) {
auto shape = feat0_double.sizes();
feat0_double = feat0_double.reshape({-1, shape[2], shape[3], shape[4]}).contiguous();
}
if (feat1_double.dim() == 5) {
auto shape = feat1_double.sizes();
feat1_double = feat1_double.reshape({-1, shape[2], shape[3], shape[4]}).contiguous();
}
// Convert back to float32 for convolution operations
feat[0] = feat0_double.to(torch::kFloat32).contiguous();
feat[1] = feat1_double.to(torch::kFloat32).contiguous();
bb = bb_double.to(torch::kFloat32).contiguous();
torch::Tensor feat3_r = feat[0];
torch::Tensor feat4_r = feat[1];
// Disable gradients for evaluation
torch::NoGradGuard no_grad;
// Apply convolutions
torch::Tensor c3_r = conv3_1r->forward(feat3_r);
c3_r = c3_r.contiguous();
// Convert bb from xywh to x0y0x1y1 format with high precision
auto bb_clone = bb.clone();
bb_double = bb_clone.to(torch::kFloat64);
auto xy = bb_double.index({torch::indexing::Slice(), torch::indexing::Slice(0, 2)});
auto wh = bb_double.index({torch::indexing::Slice(), torch::indexing::Slice(2, 4)});
bb_double.index_put_({torch::indexing::Slice(), torch::indexing::Slice(2, 4)}, xy + wh);
bb_clone = bb_double.to(torch::kFloat32);
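// e.g. an xywh box [10, 20, 30, 40] becomes [10, 20, 40, 60] in x0y0x1y1 format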
// Add batch_index to rois - match Python implementation exactly
int batch_size = bb.size(0);
auto batch_index = torch::arange(batch_size, torch::kFloat32).reshape({-1, 1}).to(bb.device());
auto roi1 = torch::cat({batch_index, bb_clone}, /*dim=*/1).contiguous();
// Apply RoI pooling
torch::Tensor roi3r = prroi_pool3r->forward(c3_r, roi1);
roi3r = roi3r.contiguous();
torch::Tensor c4_r = conv4_1r->forward(feat4_r);
c4_r = c4_r.contiguous();
torch::Tensor roi4r = prroi_pool4r->forward(c4_r, roi1);
roi4r = roi4r.contiguous();
torch::Tensor fc3_r = fc3_1r->forward(roi3r);
fc3_r = fc3_r.contiguous();
// Concatenate with higher precision
auto fc3_r_double = fc3_r.to(torch::kFloat64);
auto roi4r_double = roi4r.to(torch::kFloat64);
auto fc34_r_double = torch::cat({fc3_r_double, roi4r_double}, /*dim=*/1);
auto fc34_r = fc34_r_double.to(torch::kFloat32).contiguous();
// Apply final convolutions
torch::Tensor fc34_3_r = fc34_3r->forward(fc34_r);
fc34_3_r = fc34_3_r.contiguous();
torch::Tensor fc34_4_r = fc34_4r->forward(fc34_r);
fc34_4_r = fc34_4_r.contiguous();
return {fc34_3_r, fc34_4_r};
}
// Predict IoU for proposals
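// modulation: output of get_modulation(); feat: output of get_iou_feat();
// proposals: (batch, num_proposals, 4) boxes in (x, y, w, h) format.
// Returns a (batch, num_proposals) tensor of predicted IoU scores.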
torch::Tensor BBRegressor::predict_iou(std::vector<torch::Tensor> modulation,
std::vector<torch::Tensor> feat,
torch::Tensor proposals) {
try {
// Convert to double precision for better numerical stability
auto modulation0_double = modulation[0].to(torch::kFloat64);
auto modulation1_double = modulation[1].to(torch::kFloat64);
auto feat0_double = feat[0].to(torch::kFloat64);
auto feat1_double = feat[1].to(torch::kFloat64);
auto proposals_double = proposals.to(torch::kFloat64);
// Extract modulation vectors and features
torch::Tensor fc34_3_r = modulation0_double;
torch::Tensor fc34_4_r = modulation1_double;
torch::Tensor c3_t = feat0_double;
torch::Tensor c4_t = feat1_double;
// Ensure proper shapes with contiguous memory
fc34_3_r = fc34_3_r.contiguous();
fc34_4_r = fc34_4_r.contiguous();
c3_t = c3_t.contiguous();
c4_t = c4_t.contiguous();
proposals = proposals_double.to(torch::kFloat32).contiguous();
int batch_size = c3_t.size(0);
int num_proposals_per_batch = proposals.size(1);
// Reshape modulation vectors exactly like Python implementation
torch::Tensor fc34_3_r_reshaped;
if (fc34_3_r.dim() == 2) {
fc34_3_r_reshaped = fc34_3_r.reshape({batch_size, -1, 1, 1});
} else if (fc34_3_r.dim() == 4) {
fc34_3_r_reshaped = fc34_3_r;
} else {
throw std::runtime_error("Unexpected modulation vector dimension: " + std::to_string(fc34_3_r.dim()));
}
torch::Tensor fc34_4_r_reshaped;
if (fc34_4_r.dim() == 2) {
fc34_4_r_reshaped = fc34_4_r.reshape({batch_size, -1, 1, 1});
} else if (fc34_4_r.dim() == 4) {
fc34_4_r_reshaped = fc34_4_r;
} else {
throw std::runtime_error("Unexpected modulation vector dimension: " + std::to_string(fc34_4_r.dim()));
}
// Element-wise multiplication for modulation
auto c3_t_att_double = c3_t * fc34_3_r_reshaped;
auto c4_t_att_double = c4_t * fc34_4_r_reshaped;
// Convert back to float32 for ROI pooling operations
auto c3_t_att = c3_t_att_double.to(torch::kFloat32).contiguous();
auto c4_t_att = c4_t_att_double.to(torch::kFloat32).contiguous();
// Add batch index to ROIs
auto batch_index = torch::arange(batch_size, torch::kFloat32).reshape({-1, 1}).to(c3_t.device());
// Convert proposals from xywh to x0y0x1y1 format with high precision
proposals_double = proposals.to(torch::kFloat64);
auto proposals_xy = proposals_double.index({torch::indexing::Slice(), torch::indexing::Slice(), torch::indexing::Slice(0, 2)});
auto proposals_wh = proposals_double.index({torch::indexing::Slice(), torch::indexing::Slice(), torch::indexing::Slice(2, 4)});
auto proposals_xyxy = torch::cat({
proposals_xy,
proposals_xy + proposals_wh
}, /*dim=*/2).contiguous();
// Add batch index - match Python exactly
auto batch_idx_expanded = batch_index.reshape({batch_size, -1, 1}).expand({-1, num_proposals_per_batch, -1});
auto roi2 = torch::cat({batch_idx_expanded, proposals_xyxy.to(torch::kFloat32)}, /*dim=*/2);
roi2 = roi2.reshape({-1, 5}).to(proposals_xyxy.device()).contiguous();
// Apply ROI pooling
torch::Tensor roi3t = prroi_pool3t->forward(c3_t_att, roi2);
roi3t = roi3t.contiguous();
torch::Tensor roi4t = prroi_pool4t->forward(c4_t_att, roi2);
roi4t = roi4t.contiguous();
// Apply linear blocks
torch::Tensor fc3_rt_out = fc3_rt.forward(roi3t);
torch::Tensor fc4_rt_out = fc4_rt.forward(roi4t);
// Concatenate features with high precision
auto fc3_rt_out_double = fc3_rt_out.to(torch::kFloat64);
auto fc4_rt_out_double = fc4_rt_out.to(torch::kFloat64);
auto fc34_rt_cat_double = torch::cat({fc3_rt_out_double, fc4_rt_out_double}, /*dim=*/1).contiguous();
// Final prediction with high precision
auto fc34_rt_cat_float = fc34_rt_cat_double.to(torch::kFloat32);
// Fall back to the CPU path if the CUDA forward pass fails
if (fc34_rt_cat_float.device().is_cuda()) {
try {
auto iou_pred_double = iou_predictor->forward(fc34_rt_cat_float).to(torch::kFloat64);
iou_pred_double = iou_pred_double.reshape({batch_size, num_proposals_per_batch}).contiguous();
return iou_pred_double.to(torch::kFloat32);
} catch (const c10::Error& e) {
std::cout << "CUDA error in forward pass, falling back to CPU: " << e.what() << std::endl;
// Fall back to CPU
fc34_rt_cat_float = fc34_rt_cat_float.to(torch::kCPU);
}
}
// CPU path
auto iou_pred_double = iou_predictor->forward(fc34_rt_cat_float).to(torch::kFloat64);
iou_pred_double = iou_pred_double.reshape({batch_size, num_proposals_per_batch}).contiguous();
return iou_pred_double.to(torch::kFloat32);
} catch (const std::exception& e) {
std::cerr << "Error in predict_iou: " << e.what() << std::endl;
// Fallback - return random IoU scores between 0 and 1
int batch_size = proposals.size(0);
int num_proposals = proposals.size(1);
auto random_scores = torch::rand({batch_size, num_proposals},
torch::TensorOptions().device(torch::kCPU));
std::cout << "Returning random fallback IoU scores" << std::endl;
return random_scores;
}
}
// Print model information
void BBRegressor::print_model_info() {
std::cout << "BBRegressor Model Information:" << std::endl;
std::cout << " - Model directory: " << model_dir << std::endl;
std::cout << " - Device: CUDA:" << device.index() << std::endl;
std::cout << " - CUDA Device Count: " << torch::cuda::device_count() << std::endl;
std::cout << " - Using PreciseRoIPooling: " <<
#ifdef WITH_PRROI_POOLING
"Yes"
#else
"No (will fail)"
#endif
<< std::endl;
}
// Compute statistics for a tensor
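// These summary statistics (shape, mean/std/min/max/sum plus a few sampled values) act as
// lightweight fingerprints of intermediate tensors, presumably for comparing this C++ port
// against the Python reference outputs.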
BBRegressor::TensorStats BBRegressor::compute_stats(const torch::Tensor& tensor) {
TensorStats stats;
// Get shape
for (int i = 0; i < tensor.dim(); i++) {
stats.shape.push_back(tensor.size(i));
}
// Compute basic stats - make sure we reduce to scalar values
stats.mean = tensor.mean().item<float>(); // Mean of all elements
stats.std_dev = tensor.std().item<float>(); // Std dev of all elements
stats.min_val = tensor.min().item<float>(); // Min of all elements
stats.max_val = tensor.max().item<float>(); // Max of all elements
stats.sum = tensor.sum().item<float>(); // Sum of all elements
// Sample values at specific positions
if (tensor.dim() >= 4) {
// For 4D tensors (batch, channel, height, width)
stats.samples.push_back(tensor.index({0, 0, 0, 0}).item<float>());
if (tensor.size(1) > 1 && tensor.size(2) > 1 && tensor.size(3) > 1) {
int mid_c = static_cast<int>(tensor.size(1) / 2);
int mid_h = static_cast<int>(tensor.size(2) / 2);
int mid_w = static_cast<int>(tensor.size(3) / 2);
stats.samples.push_back(tensor.index({0, mid_c, mid_h, mid_w}).item<float>());
// Use static_cast to convert int64_t to int to avoid type mismatch
int64_t last_c_idx = tensor.size(1) - 1;
int64_t last_h_idx = tensor.size(2) - 1;
int64_t last_w_idx = tensor.size(3) - 1;
// Limit indices to avoid accessing out of bounds
if (last_c_idx > 10) last_c_idx = 10;
if (last_h_idx > 10) last_h_idx = 10;
if (last_w_idx > 10) last_w_idx = 10;
stats.samples.push_back(tensor.index({0, static_cast<int>(last_c_idx),
static_cast<int>(last_h_idx),
static_cast<int>(last_w_idx)}).item<float>());
}
} else if (tensor.dim() == 3) {
// For 3D tensors
stats.samples.push_back(tensor.index({0, 0, 0}).item<float>());
if (tensor.size(1) > 1 && tensor.size(2) > 1) {
int mid_h = static_cast<int>(tensor.size(1) / 2);
int mid_w = static_cast<int>(tensor.size(2) / 2);
stats.samples.push_back(tensor.index({0, mid_h, mid_w}).item<float>());
int last_h = static_cast<int>(tensor.size(1) - 1);
int last_w = static_cast<int>(tensor.size(2) - 1);
stats.samples.push_back(tensor.index({0, last_h, last_w}).item<float>());
}
} else if (tensor.dim() == 2) {
// For 2D tensors
stats.samples.push_back(tensor.index({0, 0}).item<float>());
if (tensor.size(0) > 1 && tensor.size(1) > 1) {
int mid_h = static_cast<int>(tensor.size(0) / 2);
int mid_w = static_cast<int>(tensor.size(1) / 2);
stats.samples.push_back(tensor.index({mid_h, mid_w}).item<float>());
int last_h = static_cast<int>(tensor.size(0) - 1);
int last_w = static_cast<int>(tensor.size(1) - 1);
stats.samples.push_back(tensor.index({last_h, last_w}).item<float>());
}
} else {
// For 1D tensors or scalars
if (tensor.numel() > 0) {
stats.samples.push_back(tensor.index({0}).item<float>());
if (tensor.size(0) > 1) {
int mid = static_cast<int>(tensor.size(0) / 2);
stats.samples.push_back(tensor.index({mid}).item<float>());
int last = static_cast<int>(tensor.size(0) - 1);
stats.samples.push_back(tensor.index({last}).item<float>());
}
}
}
return stats;
}
// Save tensor statistics to a file
void BBRegressor::save_stats(const std::vector<TensorStats>& all_stats, const std::string& filepath) {
std::ofstream file(filepath);
if (!file.is_open()) {
std::cerr << "Error opening file for writing: " << filepath << std::endl;
return;
}
for (size_t i = 0; i < all_stats.size(); i++) {
const auto& stats = all_stats[i];
file << "Output " << i << ":" << std::endl;
file << " Shape: [";
for (size_t j = 0; j < stats.shape.size(); j++) {
file << stats.shape[j];
if (j < stats.shape.size() - 1) file << ", ";
}
file << "]" << std::endl;
file << " Mean: " << stats.mean << std::endl;
file << " Std: " << stats.std_dev << std::endl;
file << " Min: " << stats.min_val << std::endl;
file << " Max: " << stats.max_val << std::endl;
file << " Sum: " << stats.sum << std::endl;
file << " Sample values: [";
for (size_t j = 0; j < stats.samples.size(); j++) {
file << stats.samples[j];
if (j < stats.samples.size() - 1) file << ", ";
}
file << "]" << std::endl << std::endl;
}
file.close();
}
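// ---------------------------------------------------------------------------
// Example usage (sketch only; variable names like layer2_ref / proposals are
// placeholders and assume backbone features have already been extracted):
//
//   BBRegressor regressor("/path/to/base_dir", torch::Device(torch::kCUDA, 0));
//   regressor.print_model_info();
//
//   // Reference frame: modulation vectors from features + target box (xywh)
//   auto modulation = regressor.get_modulation({layer2_ref, layer3_ref}, target_bb);
//
//   // Test frame: IoU features, then score candidate boxes (xywh proposals)
//   auto iou_feat = regressor.get_iou_feat({layer2_test, layer3_test});
//   auto scores = regressor.predict_iou(modulation, iou_feat, proposals);
// ---------------------------------------------------------------------------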