You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
928 lines
41 KiB
928 lines
41 KiB
#include "bb_regressor.h"
|
|
#include <iostream>
|
|
#include <fstream>
|
|
#include <torch/script.h>
|
|
#include <torch/serialize.h>
|
|
#include <vector>
|
|
#include <stdexcept>
|
|
// Add CUDA includes for required CUDA implementation
|
|
#include <cuda_runtime.h>
|
|
#include <ATen/cuda/CUDAContext.h>
|
|
|
|
// Use the PrRoIPooling implementation
|
|
#include "prroi_pooling_gpu.h"
|
|
#include "prroi_pooling_gpu_impl.cuh"
|
|
|
|
// PrRoIPool2D implementation (requires CUDA)
|
|
// Stores the pooling configuration; nothing else is done at construction time.
//
// pooled_height / pooled_width: spatial size of the output produced per ROI.
// spatial_scale: scale factor handed to the pooling routine on every forward().
PrRoIPool2D::PrRoIPool2D(int pooled_height, int pooled_width, float spatial_scale)
    : pooled_height_(pooled_height),
      pooled_width_(pooled_width),
      spatial_scale_(spatial_scale) {
}
|
|
|
|
// Pools `feat` over every box in `rois` with prroi_pooling_forward_cuda.
//
// feat: CUDA feature map indexed as [N, C, H, W].
// rois: CUDA tensor with one 5-value row per box (callers in this file build
//       rows as [batch_idx, x1, y1, x2, y2]).
// Returns a [num_rois, C, pooled_height_, pooled_width_] tensor with the device
// and dtype of `feat`.
// Throws std::runtime_error when an input is not on CUDA or has the wrong shape.
//
// The wrapper is given host pointers to float data, so both inputs are staged
// on the CPU as contiguous float32 tensors before the call.
torch::Tensor PrRoIPool2D::forward(torch::Tensor feat, torch::Tensor rois) {
    if (!feat.is_cuda() || !rois.is_cuda()) {
        throw std::runtime_error("PrRoIPool2D requires CUDA tensors - CPU mode is not supported");
    }
    // Validate ranks up front: the code below reads feat.size(2)/size(3) and the
    // wrapper reads five floats per ROI row.
    if (feat.dim() != 4) {
        throw std::runtime_error("PrRoIPool2D: feat must be a 4-D [N, C, H, W] tensor");
    }
    if (rois.dim() != 2 || rois.size(1) != 5) {
        throw std::runtime_error("PrRoIPool2D: rois must be a 2-D [num_rois, 5] tensor");
    }

    const int channels = static_cast<int>(feat.size(1));
    const int height = static_cast<int>(feat.size(2));
    const int width = static_cast<int>(feat.size(3));
    const int num_rois = static_cast<int>(rois.size(0));

    // Shape info for debugging
    std::cout << " PrRoIPool2D inputs: " << std::endl;
    std::cout << " Features: [" << feat.size(0) << ", " << channels << ", "
              << height << ", " << width << "]" << std::endl;
    std::cout << " ROIs: [" << rois.size(0) << ", " << rois.size(1) << "]" << std::endl;
    std::cout << " Pooled size: [" << pooled_height_ << ", " << pooled_width_ << "]" << std::endl;
    std::cout << " Spatial scale: " << spatial_scale_ << std::endl;

    // Host-side float32 staging copies. Converting the dtype here guarantees the
    // raw pointers handed to the wrapper really address float data.
    auto feat_cpu = feat.to(torch::kCPU, torch::kFloat32).contiguous();
    auto rois_cpu = rois.to(torch::kCPU, torch::kFloat32).contiguous();

    // Print the first few ROIs from the host copy (avoids one device sync per element)
    std::cout << " ROI values: " << std::endl;
    const auto roi_rows = rois_cpu.accessor<float, 2>();
    for (int i = 0; i < std::min(num_rois, 3); ++i) {
        std::cout << " ROI " << i << ": [";
        for (int64_t j = 0; j < rois_cpu.size(1); ++j) {
            std::cout << roi_rows[i][j];
            if (j < rois_cpu.size(1) - 1) std::cout << ", ";
        }
        std::cout << "]" << std::endl;
    }

    // Allocate the result directly on the host; it is moved to feat's device once filled.
    auto output_cpu = torch::zeros({num_rois, channels, pooled_height_, pooled_width_},
                                   torch::TensorOptions().dtype(torch::kFloat32).device(torch::kCPU));

    // Nothing to pool: the empty result is already correct, so skip the wrapper call.
    if (num_rois == 0) {
        return output_cpu.to(feat.device(), feat.scalar_type());
    }

    std::cout << " Calling prroi_pooling_forward_cuda..." << std::endl;
    prroi_pooling_forward_cuda(
        feat_cpu.data_ptr<float>(),
        rois_cpu.data_ptr<float>(),
        output_cpu.data_ptr<float>(),
        channels,
        height,
        width,
        num_rois,
        pooled_height_,
        pooled_width_,
        spatial_scale_);
    std::cout << " prroi_pooling_forward_cuda completed" << std::endl;

    // Hand the result back on the device (and in the dtype) of the input features.
    return output_cpu.to(feat.device(), feat.scalar_type());
}
|
|
|
|
// LinearBlock implementation
|
|
// Builds the block's sub-modules: a Linear layer, plus an optional BatchNorm2d
// and an optional in-place ReLU.
//
// in_planes, input_sz: the Linear layer takes in_planes * input_sz * input_sz inputs.
// out_planes:          number of Linear outputs (and BatchNorm features).
// bias:                whether the Linear layer has a bias term.
// batch_norm / relu:   enable the corresponding optional stage.
LinearBlock::LinearBlock(int in_planes, int out_planes, int input_sz, bool bias, bool batch_norm, bool relu) {
    const int flattened_features = in_planes * input_sz * input_sz;
    linear = register_module(
        "linear",
        torch::nn::Linear(torch::nn::LinearOptions(flattened_features, out_planes).bias(bias)));

    use_bn = batch_norm;
    use_relu = relu;

    if (use_bn) {
        // BatchNorm2d (not 1d) on purpose, to match the Python implementation
        bn = register_module("bn", torch::nn::BatchNorm2d(torch::nn::BatchNorm2dOptions(out_planes)));
    }

    if (use_relu) {
        relu_ = register_module("relu", torch::nn::ReLU(torch::nn::ReLUOptions().inplace(true)));
    }
}
|
|
|
|
// Applies Linear -> (BatchNorm2d) -> (ReLU) to `x` and returns a 2-D tensor
// [x.size(0), features] in the dtype of the layer outputs.
//
// Every layer runs in the input's own dtype. Earlier revisions converted to
// float64 and straight back around each step; no arithmetic was ever done in
// double, so those conversions only cost extra copies and are gone.
torch::Tensor LinearBlock::forward(torch::Tensor x) {
    // Flatten everything except the leading dimension (Python: x.reshape(x.shape[0], -1))
    auto out = linear->forward(x.reshape({x.size(0), -1}).contiguous());

    if (use_bn) {
        // BatchNorm2d needs a 4-D input, so present the features as [N, C, 1, 1]
        // (Python: self.bn(x.reshape(x.shape[0], x.shape[1], 1, 1)))
        out = bn->forward(out.reshape({out.size(0), out.size(1), 1, 1}).contiguous());
    }

    if (use_relu) {
        // In-place ReLU is safe here: `out` is a tensor produced by the layers
        // above, never the caller's input.
        out = relu_->forward(out);
    }

    // Back to 2-D, dropping the singleton spatial dims added for BatchNorm
    return out.reshape({out.size(0), -1}).contiguous();
}
|
|
|
|
// Create convolutional block
|
|
// Assembles a Conv2d (with bias) -> BatchNorm2d -> in-place ReLU stack.
//
// in_planes / out_planes: conv input and output channel counts; out_planes is
//                         also the BatchNorm feature count.
// kernel_size, stride, padding, dilation: forwarded to the Conv2d options.
torch::nn::Sequential BBRegressor::create_conv_block(int in_planes, int out_planes,
                                                     int kernel_size, int stride,
                                                     int padding, int dilation) {
    // Dimensions for debugging
    std::cout << "Creating conv block: in_planes=" << in_planes << ", out_planes=" << out_planes << std::endl;

    const auto conv_cfg = torch::nn::Conv2dOptions(in_planes, out_planes, kernel_size)
                              .stride(stride)
                              .padding(padding)
                              .dilation(dilation)
                              .bias(true);

    // Build the layers in a fixed order, then hand them to the container together.
    torch::nn::Conv2d conv_layer(conv_cfg);
    torch::nn::BatchNorm2d norm_layer(torch::nn::BatchNorm2dOptions(out_planes));
    torch::nn::ReLU activation(torch::nn::ReLUOptions().inplace(true));

    return torch::nn::Sequential(conv_layer, norm_layer, activation);
}
|
|
|
|
// Helper function to verify BatchNorm dimensions
|
|
void BBRegressor::verify_batchnorm_dimensions() {
|
|
std::cout << "Verifying BatchNorm dimensions..." << std::endl;
|
|
|
|
// Get children of conv3_1r
|
|
std::cout << "conv3_1r has " << conv3_1r->size() << " modules" << std::endl;
|
|
if (conv3_1r->size() > 1) {
|
|
auto module = conv3_1r[1];
|
|
std::cout << "conv3_1r module[1] type: " << module->name() << std::endl;
|
|
}
|
|
|
|
// Get children of conv3_1t
|
|
std::cout << "conv3_1t has " << conv3_1t->size() << " modules" << std::endl;
|
|
if (conv3_1t->size() > 1) {
|
|
auto module = conv3_1t[1];
|
|
std::cout << "conv3_1t module[1] type: " << module->name() << std::endl;
|
|
}
|
|
|
|
// Get children of conv3_2t
|
|
std::cout << "conv3_2t has " << conv3_2t->size() << " modules" << std::endl;
|
|
if (conv3_2t->size() > 1) {
|
|
auto module = conv3_2t[1];
|
|
std::cout << "conv3_2t module[1] type: " << module->name() << std::endl;
|
|
}
|
|
}
|
|
|
|
// Helper function to read file to bytes
|
|
// Reads the whole file at `file_path` into memory.
//
// Returns the file's bytes (empty for an empty file).
// Throws std::runtime_error if the file cannot be opened, its size cannot be
// determined, or the read fails.
std::vector<char> BBRegressor::read_file_to_bytes(const std::string& file_path) {
    // Open positioned at the end so tellg() reports the file size
    std::ifstream file(file_path, std::ios::binary | std::ios::ate);
    if (!file.is_open()) {
        throw std::runtime_error("Could not open file: " + file_path);
    }

    const std::streamsize size = file.tellg();
    // tellg() reports failure as -1; using that as a vector size would request
    // an enormous allocation instead of raising a clear error.
    if (size < 0) {
        throw std::runtime_error("Could not determine size of file: " + file_path);
    }
    file.seekg(0, std::ios::beg);

    std::vector<char> buffer(static_cast<std::vector<char>::size_type>(size));
    if (!file.read(buffer.data(), size)) {
        throw std::runtime_error("Could not read file: " + file_path);
    }

    return buffer;
}
|
|
|
|
// Load tensor from file
|
|
// Loads a pickled tensor from `file_path` and returns it on this object's device.
//
// c10::Error raised while decoding or moving the tensor is logged to stderr and
// rethrown; errors from read_file_to_bytes propagate without being logged here.
torch::Tensor BBRegressor::load_tensor(const std::string& file_path) {
    try {
        // Read the raw bytes, decode them, and move the result in one expression
        const std::vector<char> pickled = read_file_to_bytes(file_path);
        return torch::pickle_load(pickled).toTensor().to(device);
    } catch (const c10::Error& err) {
        std::cerr << "Error loading tensor from " << file_path << ": " << err.what() << std::endl;
        throw;
    }
}
|
|
|
|
// Constructor
|
|
// Builds every layer of the regressor, loads the weights found in
// `model_weights_dir`, and switches the model to evaluation mode.
//
// model_weights_dir: directory holding the per-tensor weight files.
// dev:               device that loaded tensors are moved to.
// Throws std::runtime_error if the directory does not exist; errors raised
// while loading weights propagate.
BBRegressor::BBRegressor(const std::string& model_weights_dir, torch::Device dev)
    : device(dev), model_dir(model_weights_dir),
      // Linear blocks, matching Python:
      //   self.fc3_rt = LinearBlock(pred_input_dim[0], pred_inter_dim[0], 5)
      //   self.fc4_rt = LinearBlock(pred_input_dim[1], pred_inter_dim[1], 3)
      // They are constructed once, here. The body used to rebuild them with the
      // same arguments and copy-assign over these instances, which only repeated
      // the allocation and initialisation.
      fc3_rt(256, 256, 5, true, true, true),
      fc4_rt(256, 256, 3, true, true, true) {

    // Check if model directory exists
    if (!fs::exists(model_dir)) {
        throw std::runtime_error("Model directory does not exist: " + model_dir);
    }

    // Initialize convolution blocks - match Python's AtomIoUNet implementation exactly
    std::cout << "Initializing conv blocks..." << std::endl;

    // In Python: self.conv3_1r = conv(input_dim[0], 128, kernel_size=3, stride=1)
    conv3_1r = create_conv_block(512, 128, 3, 1, 1, 1);

    // In Python: self.conv3_1t = conv(input_dim[0], 256, kernel_size=3, stride=1)
    conv3_1t = create_conv_block(512, 256, 3, 1, 1, 1);

    // In Python: self.conv3_2t = conv(256, pred_input_dim[0], kernel_size=3, stride=1)
    conv3_2t = create_conv_block(256, 256, 3, 1, 1, 1);

    // In Python: self.prroi_pool3r = PrRoIPool2D(3, 3, 1/8)
    prroi_pool3r = std::make_shared<PrRoIPool2D>(3, 3, 0.125); // 1/8 scale for layer2

    // In Python: self.prroi_pool3t = PrRoIPool2D(5, 5, 1/8)
    prroi_pool3t = std::make_shared<PrRoIPool2D>(5, 5, 0.125); // 1/8 scale for layer2

    // In Python: self.fc3_1r = conv(128, 256, kernel_size=3, stride=1, padding=0)
    fc3_1r = create_conv_block(128, 256, 3, 1, 0, 1); // padding=0 for this layer

    // In Python: self.conv4_1r = conv(input_dim[1], 256, kernel_size=3, stride=1)
    conv4_1r = create_conv_block(1024, 256, 3, 1, 1, 1);

    // In Python: self.conv4_1t = conv(input_dim[1], 256, kernel_size=3, stride=1)
    conv4_1t = create_conv_block(1024, 256, 3, 1, 1, 1);

    // In Python: self.conv4_2t = conv(256, pred_input_dim[1], kernel_size=3, stride=1)
    conv4_2t = create_conv_block(256, 256, 3, 1, 1, 1);

    // In Python: self.prroi_pool4r = PrRoIPool2D(1, 1, 1/16)
    prroi_pool4r = std::make_shared<PrRoIPool2D>(1, 1, 0.0625); // 1/16 scale for layer3

    // In Python: self.prroi_pool4t = PrRoIPool2D(3, 3, 1/16)
    prroi_pool4t = std::make_shared<PrRoIPool2D>(3, 3, 0.0625); // 1/16 scale for layer3

    // In Python: self.fc34_3r = conv(256 + 256, pred_input_dim[0], kernel_size=1, stride=1, padding=0)
    fc34_3r = create_conv_block(512, 256, 1, 1, 0, 1); // kernel_size=1, padding=0

    // In Python: self.fc34_4r = conv(256 + 256, pred_input_dim[1], kernel_size=1, stride=1, padding=0)
    fc34_4r = create_conv_block(512, 256, 1, 1, 0, 1); // kernel_size=1, padding=0

    // In Python: self.iou_predictor = nn.Linear(pred_inter_dim[0]+pred_inter_dim[1], 1, bias=True)
    iou_predictor = torch::nn::Linear(torch::nn::LinearOptions(256 + 256, 1).bias(true));

    // Load all weights
    load_weights();

    // Set the model to evaluation mode
    this->eval();

    // Debug information
    std::cout << "BB Regressor initialized in evaluation mode" << std::endl;
}
|
|
|
|
// Set the model to evaluation mode
|
|
void BBRegressor::eval() {
|
|
// Set all sequential modules to eval mode
|
|
conv3_1r->eval();
|
|
conv3_1t->eval();
|
|
conv3_2t->eval();
|
|
fc3_1r->eval();
|
|
conv4_1r->eval();
|
|
conv4_1t->eval();
|
|
conv4_2t->eval();
|
|
fc34_3r->eval();
|
|
fc34_4r->eval();
|
|
|
|
// Linear blocks also need to be set to eval mode for BatchNorm layers
|
|
fc3_rt.eval();
|
|
fc4_rt.eval();
|
|
|
|
// Set linear layers to eval mode (though this usually doesn't have any effect)
|
|
iou_predictor->eval();
|
|
}
|
|
|
|
// Load weights
|
|
void BBRegressor::load_weights() {
|
|
// Helper lambda to load weights for a sequential module
|
|
auto load_sequential_weights = [this](torch::nn::Sequential& seq, const std::string& prefix) {
|
|
try {
|
|
// Load weights for conv layer (index 0)
|
|
std::string weight_path = model_dir + "/" + prefix + "_0_weight.pt";
|
|
std::string bias_path = model_dir + "/" + prefix + "_0_bias.pt";
|
|
|
|
if (fs::exists(weight_path) && fs::exists(bias_path)) {
|
|
auto conv_weight = load_tensor(weight_path);
|
|
auto conv_bias = load_tensor(bias_path);
|
|
|
|
// Get the conv2d module from sequential
|
|
// Fix: Get the number of output channels from the weight tensor
|
|
int out_channels = conv_weight.size(0);
|
|
int in_channels = conv_weight.size(1);
|
|
int kernel_size = conv_weight.size(2);
|
|
|
|
std::cout << "Loading " << prefix << " conv weights: "
|
|
<< "[out_ch=" << out_channels
|
|
<< ", in_ch=" << in_channels
|
|
<< ", kernel=" << kernel_size << "]" << std::endl;
|
|
|
|
// FIXED: Use the correct padding based on the layer name
|
|
int padding = 1; // Default padding
|
|
|
|
// Special cases for layers with different padding
|
|
if (prefix == "fc3_1r" || prefix == "fc34_3r" || prefix == "fc34_4r") {
|
|
padding = 0; // These layers use padding=0 in the Python implementation
|
|
}
|
|
|
|
std::cout << " Using padding=" << padding << " for " << prefix << std::endl;
|
|
|
|
auto conv_options = torch::nn::Conv2dOptions(in_channels, out_channels, kernel_size)
|
|
.stride(1).padding(padding).bias(true);
|
|
auto conv_module = torch::nn::Conv2d(conv_options);
|
|
|
|
// Set weights and bias
|
|
conv_module->weight = conv_weight;
|
|
conv_module->bias = conv_bias;
|
|
|
|
// Debug info - print some weight stats
|
|
std::cout << " Conv weight stats: mean=" << conv_weight.mean().item<float>()
|
|
<< ", std=" << conv_weight.std().item<float>()
|
|
<< ", min=" << conv_weight.min().item<float>()
|
|
<< ", max=" << conv_weight.max().item<float>() << std::endl;
|
|
|
|
// Create a new sequence with the proper conv module
|
|
auto new_seq = torch::nn::Sequential();
|
|
new_seq->push_back(conv_module);
|
|
|
|
// Load batch norm parameters (index 1)
|
|
std::string bn_weight_path = model_dir + "/" + prefix + "_1_weight.pt";
|
|
std::string bn_bias_path = model_dir + "/" + prefix + "_1_bias.pt";
|
|
std::string bn_mean_path = model_dir + "/" + prefix + "_1_running_mean.pt";
|
|
std::string bn_var_path = model_dir + "/" + prefix + "_1_running_var.pt";
|
|
|
|
if (fs::exists(bn_weight_path) && fs::exists(bn_bias_path) &&
|
|
fs::exists(bn_mean_path) && fs::exists(bn_var_path)) {
|
|
|
|
auto bn_weight = load_tensor(bn_weight_path);
|
|
auto bn_bias = load_tensor(bn_bias_path);
|
|
auto bn_mean = load_tensor(bn_mean_path);
|
|
auto bn_var = load_tensor(bn_var_path);
|
|
|
|
// Important: Create BatchNorm with the correct number of features from the weights
|
|
int num_features = bn_weight.size(0);
|
|
std::cout << " Creating BatchNorm2d with num_features=" << num_features << std::endl;
|
|
|
|
// Create a proper batch norm module with the right number of features
|
|
auto bn_options = torch::nn::BatchNorm2dOptions(num_features)
|
|
.eps(1e-5) // Match Python default
|
|
.momentum(0.1) // Match Python default
|
|
.affine(true)
|
|
.track_running_stats(true);
|
|
auto bn_module = torch::nn::BatchNorm2d(bn_options);
|
|
|
|
// Set batch norm parameters
|
|
bn_module->weight = bn_weight;
|
|
bn_module->bias = bn_bias;
|
|
bn_module->running_mean = bn_mean;
|
|
bn_module->running_var = bn_var;
|
|
|
|
// Debug info - print some batch norm stats
|
|
std::cout << " BN weight stats: mean=" << bn_weight.mean().item<float>()
|
|
<< ", std=" << bn_weight.std().item<float>() << std::endl;
|
|
std::cout << " BN running_mean stats: mean=" << bn_mean.mean().item<float>()
|
|
<< ", std=" << bn_mean.std().item<float>() << std::endl;
|
|
std::cout << " BN running_var stats: mean=" << bn_var.mean().item<float>()
|
|
<< ", std=" << bn_var.std().item<float>() << std::endl;
|
|
|
|
// Add the batch norm module to the sequence
|
|
new_seq->push_back(bn_module);
|
|
}
|
|
|
|
// Add the ReLU module with inplace=true to match Python
|
|
auto relu_options = torch::nn::ReLUOptions().inplace(true);
|
|
new_seq->push_back(torch::nn::ReLU(relu_options));
|
|
|
|
// Replace the old sequence with the new one
|
|
seq = new_seq;
|
|
|
|
std::cout << "Loaded weights for " << prefix << std::endl;
|
|
} else {
|
|
std::cerr << "Weight files not found for " << prefix << std::endl;
|
|
}
|
|
} catch (const std::exception& e) {
|
|
std::cerr << "Error loading weights for " << prefix << ": " << e.what() << std::endl;
|
|
throw; // Re-throw to stop execution
|
|
}
|
|
};
|
|
|
|
// Load weights for linear blocks
|
|
auto load_linear_block_weights = [this](LinearBlock& block, const std::string& prefix) {
|
|
try {
|
|
// Load weights for linear layer
|
|
std::string weight_path = model_dir + "/" + prefix + "_linear_weight.pt";
|
|
std::string bias_path = model_dir + "/" + prefix + "_linear_bias.pt";
|
|
|
|
if (fs::exists(weight_path) && fs::exists(bias_path)) {
|
|
auto linear_weight = load_tensor(weight_path);
|
|
auto linear_bias = load_tensor(bias_path);
|
|
|
|
// Set weights and bias
|
|
block.linear->weight = linear_weight;
|
|
block.linear->bias = linear_bias;
|
|
|
|
// Load batch norm parameters
|
|
std::string bn_weight_path = model_dir + "/" + prefix + "_bn_weight.pt";
|
|
std::string bn_bias_path = model_dir + "/" + prefix + "_bn_bias.pt";
|
|
std::string bn_mean_path = model_dir + "/" + prefix + "_bn_running_mean.pt";
|
|
std::string bn_var_path = model_dir + "/" + prefix + "_bn_running_var.pt";
|
|
|
|
if (fs::exists(bn_weight_path) && fs::exists(bn_bias_path) &&
|
|
fs::exists(bn_mean_path) && fs::exists(bn_var_path)) {
|
|
auto bn_weight = load_tensor(bn_weight_path);
|
|
auto bn_bias = load_tensor(bn_bias_path);
|
|
auto bn_mean = load_tensor(bn_mean_path);
|
|
auto bn_var = load_tensor(bn_var_path);
|
|
|
|
// Set batch norm parameters
|
|
block.bn->weight = bn_weight;
|
|
block.bn->bias = bn_bias;
|
|
block.bn->running_mean = bn_mean;
|
|
block.bn->running_var = bn_var;
|
|
}
|
|
|
|
std::cout << "Loaded weights for " << prefix << std::endl;
|
|
} else {
|
|
std::cerr << "Weight files not found for " << prefix << std::endl;
|
|
}
|
|
} catch (const std::exception& e) {
|
|
std::cerr << "Error loading weights for " << prefix << ": " << e.what() << std::endl;
|
|
throw; // Re-throw to stop execution
|
|
}
|
|
};
|
|
|
|
// Load weights for all layers
|
|
load_sequential_weights(conv3_1r, "conv3_1r");
|
|
load_sequential_weights(conv3_1t, "conv3_1t");
|
|
load_sequential_weights(conv3_2t, "conv3_2t");
|
|
load_sequential_weights(fc3_1r, "fc3_1r");
|
|
load_sequential_weights(conv4_1r, "conv4_1r");
|
|
load_sequential_weights(conv4_1t, "conv4_1t");
|
|
load_sequential_weights(conv4_2t, "conv4_2t");
|
|
load_sequential_weights(fc34_3r, "fc34_3r");
|
|
load_sequential_weights(fc34_4r, "fc34_4r");
|
|
|
|
load_linear_block_weights(fc3_rt, "fc3_rt");
|
|
load_linear_block_weights(fc4_rt, "fc4_rt");
|
|
|
|
// Load IoU predictor weights
|
|
try {
|
|
std::string weight_path = model_dir + "/iou_predictor_weight.pt";
|
|
std::string bias_path = model_dir + "/iou_predictor_bias.pt";
|
|
|
|
if (fs::exists(weight_path) && fs::exists(bias_path)) {
|
|
auto weight = load_tensor(weight_path);
|
|
auto bias = load_tensor(bias_path);
|
|
|
|
iou_predictor->weight = weight;
|
|
iou_predictor->bias = bias;
|
|
|
|
std::cout << "Loaded weights for iou_predictor" << std::endl;
|
|
} else {
|
|
std::cerr << "Weight files not found for iou_predictor" << std::endl;
|
|
}
|
|
} catch (const std::exception& e) {
|
|
std::cerr << "Error loading weights for iou_predictor: " << e.what() << std::endl;
|
|
throw; // Re-throw to stop execution
|
|
}
|
|
}
|
|
|
|
// Move model to device
|
|
// Moves every sub-module to `device` and records it as the model's device.
//
// Throws std::runtime_error if `device` is not a CUDA device.
void BBRegressor::to(torch::Device device) {
    // Verify the device is a CUDA device
    if (!device.is_cuda()) {
        throw std::runtime_error("BBRegressor requires a CUDA device");
    }

    this->device = device;

    // All nine conv blocks. fc34_3r and fc34_4r were previously left out, so
    // they stayed behind on the old device while their inputs moved.
    for (torch::nn::Sequential* block : {&conv3_1r, &conv3_1t, &conv3_2t,
                                         &fc3_1r,
                                         &conv4_1r, &conv4_1t, &conv4_2t,
                                         &fc34_3r, &fc34_4r}) {
        (*block)->to(device);
    }

    fc3_rt.to(device);
    fc4_rt.to(device);

    iou_predictor->to(device);
}
|
|
|
|
// Get IoU features from backbone features
|
|
// Runs the two test-branch conv stacks over the backbone features.
//
// feat2: at least two tensors; feat2[0] goes through conv3_1t -> conv3_2t and
//        feat2[1] through conv4_1t -> conv4_2t. A 5-D input has its two
//        leading dimensions merged first
//        (Python: f.reshape(-1, *f.shape[-3:]) if f.dim()==5 else f).
// Returns {c3_t, c4_t}, both contiguous.
// Throws std::runtime_error if fewer than two tensors are supplied.
//
// Inputs are converted to float32 for the convolutions. The earlier
// float64 -> float32 detour did no computation in double and only added a
// full-size temporary copy, so it has been dropped.
std::vector<torch::Tensor> BBRegressor::get_iou_feat(std::vector<torch::Tensor> feat2) {
    if (feat2.size() < 2) {
        throw std::runtime_error("BBRegressor::get_iou_feat: expected two feature tensors");
    }

    // Inference only
    torch::NoGradGuard no_grad;

    const auto prepare = [](const torch::Tensor& f) {
        auto out = f.to(torch::kFloat32);
        if (out.dim() == 5) {
            out = out.reshape({-1, out.size(2), out.size(3), out.size(4)});
        }
        return out.contiguous();
    };

    const torch::Tensor feat3_t = prepare(feat2[0]);
    const torch::Tensor feat4_t = prepare(feat2[1]);

    // Apply convolutions just like the Python version
    torch::Tensor c3_t = conv3_1t->forward(feat3_t).contiguous();
    c3_t = conv3_2t->forward(c3_t).contiguous();

    torch::Tensor c4_t = conv4_1t->forward(feat4_t).contiguous();
    c4_t = conv4_2t->forward(c4_t).contiguous();

    return {c3_t, c4_t};
}
|
|
|
|
// Get modulation vectors for the target
|
|
// Computes the two modulation vectors for the target (reference branch).
//
// feat: at least two backbone feature tensors, feat3_r and feat4_r.
// bb:   initial box as [batch, 4] or [batch, 1, 4], in (x, y, w, h).
// Returns {fc34_3r output, fc34_4r output}.
// Throws std::runtime_error if fewer than two features are supplied or bb has
// an unsupported shape.
std::vector<torch::Tensor> BBRegressor::get_modulation(std::vector<torch::Tensor> feat, torch::Tensor bb) {
    torch::NoGradGuard no_grad; // Ensure no gradients are computed

    if (feat.size() < 2) {
        throw std::runtime_error("BBRegressor::get_modulation: expected two feature tensors");
    }

    auto feat3_r = feat[0].to(device);
    auto feat4_r = feat[1].to(device);
    // Boxes are forced to float32: the ROI rows built below are handed to the
    // pooling op, which reads them as float data. This also keeps the
    // batch-index column and the coordinates in one dtype.
    auto current_bb = bb.to(device, torch::kFloat32);

    // Reshape bb if it's [batch, 1, 4] to [batch, 4]
    if (current_bb.dim() == 3 && current_bb.size(1) == 1) {
        current_bb = current_bb.squeeze(1);
    }
    if (current_bb.dim() != 2 || current_bb.size(1) != 4) {
        throw std::runtime_error("BBRegressor::get_modulation: bb must be [batch, 4] or [batch, 1, 4]");
    }

    // Python: c3_r = self.conv3_1r(feat3_r)
    auto c3_r = conv3_1r->forward(feat3_r);

    // Prepare ROIs: convert bb from [x,y,w,h] to [batch_idx, x1,y1,x2,y2]
    const int64_t batch_size = current_bb.size(0);
    const auto batch_idx = torch::arange(0, batch_size, current_bb.options()).unsqueeze(1);
    const auto top_left = current_bb.slice(/*dim=*/1, /*start=*/0, /*end=*/2);
    const auto bottom_right = top_left + current_bb.slice(/*dim=*/1, /*start=*/2, /*end=*/4);
    auto rois = torch::cat(std::vector<torch::Tensor>{batch_idx, top_left, bottom_right}, 1);

    std::cout << " BBRegressor::get_modulation: Converted ROIs (first item): [";
    if (batch_size > 0) {
        for (int64_t j = 0; j < rois.size(1); j++) {
            std::cout << rois[0][j].item<float>();
            if (j < rois.size(1) - 1) std::cout << ", ";
        }
    }
    std::cout << "]" << std::endl;
    std::cout << " BBRegressor::get_modulation: c3_r shape: " << c3_r.sizes() << ", device: " << c3_r.device() << std::endl;

    // Python: roi3r = self.prroi_pool3r(c3_r, roi1)
    auto roi3r = prroi_pool3r->forward(c3_r, rois);
    std::cout << " BBRegressor::get_modulation: roi3r shape: " << roi3r.sizes() << std::endl;

    // Python: c4_r = self.conv4_1r(feat4_r)
    auto c4_r = conv4_1r->forward(feat4_r);
    std::cout << " BBRegressor::get_modulation: c4_r shape: " << c4_r.sizes() << ", device: " << c4_r.device() << std::endl;

    // Python: roi4r = self.prroi_pool4r(c4_r, roi1)
    auto roi4r = prroi_pool4r->forward(c4_r, rois);
    std::cout << " BBRegressor::get_modulation: roi4r shape: " << roi4r.sizes() << std::endl;

    // Python: fc3_r = self.fc3_1r(roi3r)
    // fc3_1r is a 3x3 conv with padding=0, so the 3x3 pooled map collapses to 1x1
    auto fc3_r = fc3_1r->forward(roi3r);
    std::cout << " BBRegressor::get_modulation: fc3_r shape: " << fc3_r.sizes() << std::endl;

    // Python: fc34_r = torch.cat((fc3_r, roi4r), dim=1)
    auto fc34_r = torch::cat(std::vector<torch::Tensor>{fc3_r, roi4r}, 1);
    std::cout << " BBRegressor::get_modulation: fc34_r shape: " << fc34_r.sizes() << std::endl;

    // Python: fc34_3_r = self.fc34_3r(fc34_r)
    auto mod_vec1 = fc34_3r->forward(fc34_r);
    std::cout << " BBRegressor::get_modulation: mod_vec1 (fc34_3_r) shape: " << mod_vec1.sizes() << std::endl;

    // Python: fc34_4_r = self.fc34_4r(fc34_r)
    auto mod_vec2 = fc34_4r->forward(fc34_r);
    std::cout << " BBRegressor::get_modulation: mod_vec2 (fc34_4_r) shape: " << mod_vec2.sizes() << std::endl;

    return {mod_vec1, mod_vec2};
}
|
|
|
|
// Predict IoU for proposals
|
|
// Scores every proposal box against the target modulation.
//
// modulation: at least two tensors, each [rows, C] or [rows, C, 1, 1], where
//             rows is 1, the batch size, or batch * num_proposals.
// feat:       at least two feature tensors; feat[0] is pooled to 5x5 and
//             feat[1] to 3x3.
// proposals:  boxes whose first two dimensions are [batch, num_proposals] and
//             which hold 4 values per box.
// Returns IoU scores shaped [batch, num_proposals].
// Throws std::runtime_error on malformed inputs; any other failure is logged
// and rethrown.
//
// NOTE(review): proposals are forwarded to the pooling op unchanged, whereas
// get_modulation converts its (x, y, w, h) box to corner form first. Confirm
// that callers pass corner-form boxes here.
torch::Tensor BBRegressor::predict_iou(std::vector<torch::Tensor> modulation,
                                       std::vector<torch::Tensor> feat,
                                       torch::Tensor proposals) {
    // Inference only, consistent with get_iou_feat and get_modulation
    torch::NoGradGuard no_grad;

    if (modulation.size() < 2 || feat.size() < 2) {
        throw std::runtime_error("BBRegressor::predict_iou: expected two modulation vectors and two feature tensors");
    }

    // Ensure all inputs are on the model's device
    const auto target_device = device;
    for (auto& t : feat) { t = t.to(target_device); }
    for (auto& m : modulation) { m = m.to(target_device); }
    // float32 boxes: the ROI rows are handed to the pooling op as float data
    proposals = proposals.to(target_device, torch::kFloat32);

    if (proposals.dim() < 2) {
        throw std::runtime_error("BBRegressor::predict_iou: proposals must be [batch, num_proposals, 4]");
    }
    const int64_t batch_size = proposals.size(0);
    const int64_t num_proposals = proposals.size(1);
    if (proposals.numel() != batch_size * num_proposals * 4) {
        throw std::runtime_error("BBRegressor::predict_iou: proposals must be [batch, num_proposals, 4]");
    }

    // Flatten proposals to [batch * num_proposals, 4] and prepend the batch index
    const auto proposals_view = proposals.reshape({batch_size * num_proposals, 4});
    const auto roi_batch_index = torch::arange(0, batch_size, proposals_view.options())
                                     .unsqueeze(1)
                                     .repeat_interleave(num_proposals, 0);
    auto roi = torch::cat(std::vector<torch::Tensor>{roi_batch_index, proposals_view}, 1);
    roi = roi.to(feat[0].device());

    // ROI pooling: one feature patch per proposal
    auto pooled_feat1 = prroi_pool3t->forward(feat[0], roi); // [batch * num_proposals, C, 5, 5]
    auto pooled_feat2 = prroi_pool4t->forward(feat[1], roi); // [batch * num_proposals, C, 3, 3]

    std::cout << " Pooled shapes:" << std::endl;
    std::cout << " pooled_feat1 (from prroi_pool3t on feat[0]): [" << pooled_feat1.sizes() << "] dev: " << pooled_feat1.device() << std::endl;
    std::cout << " pooled_feat2 (from prroi_pool4t on feat[1]): [" << pooled_feat2.sizes() << "] dev: " << pooled_feat2.device() << std::endl;

    std::cout << " IoU predictor dimensions:" << std::endl;
    std::cout << " weight: [" << iou_predictor->weight.sizes() << "]" << std::endl;
    std::cout << " bias: [" << iou_predictor->bias.sizes() << "]" << std::endl;

    // Brings a modulation tensor to 4-D and to one row per pooled ROI:
    //  - a single row is repeated for every ROI;
    //  - one row per batch item is repeated num_proposals times, matching the
    //    batch-major ROI order built above.
    const auto match_rows = [batch_size, num_proposals](torch::Tensor mod, int64_t rows) {
        if (mod.dim() == 2) {
            mod = mod.reshape({mod.size(0), mod.size(1), 1, 1});
        }
        if (mod.size(0) == 1 && rows > 1) {
            mod = mod.repeat({rows, 1, 1, 1});
        } else if (mod.size(0) != rows && mod.size(0) == batch_size) {
            mod = mod.repeat_interleave(num_proposals, 0);
        }
        return mod;
    };

    try {
        const auto mod0_4d = match_rows(modulation[0], pooled_feat1.size(0));
        const auto mod1_4d = match_rows(modulation[1], pooled_feat2.size(0));

        std::cout << " Modulation vector shapes (reshaped 4D):" << std::endl;
        std::cout << " mod0_4d: [" << mod0_4d.sizes() << "] dev: " << mod0_4d.device() << std::endl;
        std::cout << " mod1_4d: [" << mod1_4d.sizes() << "] dev: " << mod1_4d.device() << std::endl;

        auto feat_prod_0 = pooled_feat1 * mod0_4d;
        auto feat_prod_1 = pooled_feat2 * mod1_4d;
        std::cout << " After element-wise product with modulation:\n feat_prod_0 (pooled_feat1 * mod0_4d): [" << feat_prod_0.sizes() << "] dev: " << feat_prod_0.device() << "\n feat_prod_1 (pooled_feat2 * mod1_4d): [" << feat_prod_1.sizes() << "] dev: " << feat_prod_1.device() << std::endl;

        std::cout << " Applying fc3_rt to feat_prod_0..." << std::endl;
        auto x0 = fc3_rt.forward(feat_prod_0);
        std::cout << " Applying fc4_rt to feat_prod_1..." << std::endl;
        auto x1 = fc4_rt.forward(feat_prod_1);
        std::cout << " After fc_rt blocks:\n x0 (fc3_rt output): [" << x0.sizes() << "] dev: " << x0.device() << "\n x1 (fc4_rt output): [" << x1.sizes() << "] dev: " << x1.device() << std::endl;

        auto ioufeat_final = torch::cat(std::vector<torch::Tensor>{x0, x1}, 1).contiguous();
        std::cout << " Concatenated ioufeat_final: [" << ioufeat_final.sizes() << "] dev: " << ioufeat_final.device() << std::endl;

        torch::Tensor iou_scores;
        try {
            std::cout << " Applying final iou_predictor on GPU" << std::endl;
            iou_predictor->to(target_device);
            iou_scores = iou_predictor->forward(ioufeat_final.to(target_device));
            std::cout << " Final iou_predictor on GPU successful. Output scores shape: [" << iou_scores.sizes() << "]" << std::endl;
        } catch (const std::exception& cuda_error) {
            // Best-effort fallback: redo the final linear layer on host copies
            std::cout << " GPU iou_predictor->forward() failed: " << cuda_error.what() << std::endl;
            std::cout << " Falling back to CPU for final iou_predictor" << std::endl;

            auto ioufeat_final_cpu = ioufeat_final.to(torch::kCPU).contiguous();
            auto weight_cpu = iou_predictor->weight.to(torch::kCPU).contiguous();
            auto bias_cpu = torch::Tensor();
            if (iou_predictor->bias.defined()) {
                bias_cpu = iou_predictor->bias.to(torch::kCPU).contiguous();
            }

            std::cout << " DEBUG CPU Fallback: ioufeat_final_cpu device: " << ioufeat_final_cpu.device() << std::endl;
            std::cout << " DEBUG CPU Fallback: weight_cpu device: " << weight_cpu.device() << std::endl;
            if (bias_cpu.defined()) {
                std::cout << " DEBUG CPU Fallback: bias_cpu device: " << bias_cpu.device() << std::endl;
            } else {
                std::cout << " DEBUG CPU Fallback: bias_cpu is undefined." << std::endl;
            }

            try {
                iou_scores = torch::nn::functional::linear(ioufeat_final_cpu, weight_cpu, bias_cpu);
                std::cout << " CPU fallback torch::nn::functional::linear() successful. Output device: " << iou_scores.device() << std::endl;
            } catch (const std::exception& cpu_fwd_error) {
                std::cerr << "ERROR during CPU torch::nn::functional::linear(): " << cpu_fwd_error.what() << std::endl;
                iou_predictor->to(target_device);
                throw;
            }

            iou_predictor->to(target_device);
            iou_scores = iou_scores.to(target_device);
        }

        std::cout << " iou_scores raw shape: [" << iou_scores.size(0) << ", " << iou_scores.size(1) << "]" << std::endl;

        iou_scores = iou_scores.reshape({batch_size, num_proposals});
        std::cout << " Final iou_scores shape: [" << iou_scores.size(0) << ", " << iou_scores.size(1) << "]" << std::endl;

        return iou_scores;
    } catch (const std::exception& e) {
        std::cerr << "CRITICAL: Unexpected error in predict_iou: " << e.what() << std::endl;
        std::cout << " Propagating critical error. No fallback available for this stage." << std::endl;
        throw;
    }
}
|
|
|
|
// Print model information
|
|
void BBRegressor::print_model_info() {
|
|
std::cout << "BBRegressor Model Information:" << std::endl;
|
|
std::cout << " - Model directory: " << model_dir << std::endl;
|
|
std::cout << " - Device: CUDA:" << device.index() << std::endl;
|
|
std::cout << " - CUDA Device Count: " << torch::cuda::device_count() << std::endl;
|
|
std::cout << " - Using PreciseRoIPooling: " <<
|
|
#ifdef WITH_PRROI_POOLING
|
|
"Yes"
|
|
#else
|
|
"No (will fail)"
|
|
#endif
|
|
<< std::endl;
|
|
}
|
|
|
|
// Compute statistics for a tensor
|
|
BBRegressor::TensorStats BBRegressor::compute_stats(const torch::Tensor& tensor) {
|
|
TensorStats stats;
|
|
|
|
// Get shape
|
|
for (int i = 0; i < tensor.dim(); i++) {
|
|
stats.shape.push_back(tensor.size(i));
|
|
}
|
|
|
|
// Compute basic stats - make sure we reduce to scalar values
|
|
stats.mean = tensor.mean().item<float>(); // Mean of all elements
|
|
stats.std_dev = tensor.std().item<float>(); // Std dev of all elements
|
|
stats.min_val = tensor.min().item<float>(); // Min of all elements
|
|
stats.max_val = tensor.max().item<float>(); // Max of all elements
|
|
stats.sum = tensor.sum().item<float>(); // Sum of all elements
|
|
|
|
// Sample values at specific positions
|
|
if (tensor.dim() >= 4) {
|
|
// For 4D tensors (batch, channel, height, width)
|
|
stats.samples.push_back(tensor.index({0, 0, 0, 0}).item<float>());
|
|
|
|
if (tensor.size(1) > 1 && tensor.size(2) > 1 && tensor.size(3) > 1) {
|
|
int mid_c = static_cast<int>(tensor.size(1) / 2);
|
|
int mid_h = static_cast<int>(tensor.size(2) / 2);
|
|
int mid_w = static_cast<int>(tensor.size(3) / 2);
|
|
stats.samples.push_back(tensor.index({0, mid_c, mid_h, mid_w}).item<float>());
|
|
|
|
// Use static_cast to convert int64_t to int to avoid type mismatch
|
|
int64_t last_c_idx = tensor.size(1) - 1;
|
|
int64_t last_h_idx = tensor.size(2) - 1;
|
|
int64_t last_w_idx = tensor.size(3) - 1;
|
|
|
|
// Limit indices to avoid accessing out of bounds
|
|
if (last_c_idx > 10) last_c_idx = 10;
|
|
if (last_h_idx > 10) last_h_idx = 10;
|
|
if (last_w_idx > 10) last_w_idx = 10;
|
|
|
|
stats.samples.push_back(tensor.index({0, static_cast<int>(last_c_idx),
|
|
static_cast<int>(last_h_idx),
|
|
static_cast<int>(last_w_idx)}).item<float>());
|
|
}
|
|
} else if (tensor.dim() == 3) {
|
|
// For 3D tensors
|
|
stats.samples.push_back(tensor.index({0, 0, 0}).item<float>());
|
|
|
|
if (tensor.size(1) > 1 && tensor.size(2) > 1) {
|
|
int mid_h = static_cast<int>(tensor.size(1) / 2);
|
|
int mid_w = static_cast<int>(tensor.size(2) / 2);
|
|
stats.samples.push_back(tensor.index({0, mid_h, mid_w}).item<float>());
|
|
|
|
int last_h = static_cast<int>(tensor.size(1) - 1);
|
|
int last_w = static_cast<int>(tensor.size(2) - 1);
|
|
stats.samples.push_back(tensor.index({0, last_h, last_w}).item<float>());
|
|
}
|
|
} else if (tensor.dim() == 2) {
|
|
// For 2D tensors
|
|
stats.samples.push_back(tensor.index({0, 0}).item<float>());
|
|
|
|
if (tensor.size(0) > 1 && tensor.size(1) > 1) {
|
|
int mid_h = static_cast<int>(tensor.size(0) / 2);
|
|
int mid_w = static_cast<int>(tensor.size(1) / 2);
|
|
stats.samples.push_back(tensor.index({mid_h, mid_w}).item<float>());
|
|
|
|
int last_h = static_cast<int>(tensor.size(0) - 1);
|
|
int last_w = static_cast<int>(tensor.size(1) - 1);
|
|
stats.samples.push_back(tensor.index({last_h, last_w}).item<float>());
|
|
}
|
|
} else {
|
|
// For 1D tensors or scalars
|
|
if (tensor.numel() > 0) {
|
|
stats.samples.push_back(tensor.index({0}).item<float>());
|
|
|
|
if (tensor.size(0) > 1) {
|
|
int mid = static_cast<int>(tensor.size(0) / 2);
|
|
stats.samples.push_back(tensor.index({mid}).item<float>());
|
|
|
|
int last = static_cast<int>(tensor.size(0) - 1);
|
|
stats.samples.push_back(tensor.index({last}).item<float>());
|
|
}
|
|
}
|
|
}
|
|
|
|
return stats;
|
|
}
|
|
|
|
// Save tensor statistics to a file
|
|
void BBRegressor::save_stats(const std::vector<TensorStats>& all_stats, const std::string& filepath) {
|
|
std::ofstream file(filepath);
|
|
if (!file.is_open()) {
|
|
std::cerr << "Error opening file for writing: " << filepath << std::endl;
|
|
return;
|
|
}
|
|
|
|
for (size_t i = 0; i < all_stats.size(); i++) {
|
|
const auto& stats = all_stats[i];
|
|
file << "Output " << i << ":" << std::endl;
|
|
|
|
file << " Shape: [";
|
|
for (size_t j = 0; j < stats.shape.size(); j++) {
|
|
file << stats.shape[j];
|
|
if (j < stats.shape.size() - 1) file << ", ";
|
|
}
|
|
file << "]" << std::endl;
|
|
|
|
file << " Mean: " << stats.mean << std::endl;
|
|
file << " Std: " << stats.std_dev << std::endl;
|
|
file << " Min: " << stats.min_val << std::endl;
|
|
file << " Max: " << stats.max_val << std::endl;
|
|
file << " Sum: " << stats.sum << std::endl;
|
|
|
|
file << " Sample values: [";
|
|
for (size_t j = 0; j < stats.samples.size(); j++) {
|
|
file << stats.samples[j];
|
|
if (j < stats.samples.size() - 1) file << ", ";
|
|
}
|
|
file << "]" << std::endl << std::endl;
|
|
}
|
|
|
|
file.close();
|
|
}
|