#include "bb_regressor.h" #include #include #include #include #include #include // Add CUDA includes for required CUDA implementation #include #include // Use the PrRoIPooling implementation #include "prroi_pooling_gpu.h" #include "prroi_pooling_gpu_impl.cuh" // PrRoIPool2D implementation (requires CUDA) PrRoIPool2D::PrRoIPool2D(int pooled_height, int pooled_width, float spatial_scale) : pooled_height_(pooled_height), pooled_width_(pooled_width), spatial_scale_(spatial_scale) {} torch::Tensor PrRoIPool2D::forward(torch::Tensor feat, torch::Tensor rois) { // Print shape info for debugging std::cout << " PrRoIPool2D inputs: " << std::endl; std::cout << " Features: [" << feat.size(0) << ", " << feat.size(1) << ", " << feat.size(2) << ", " << feat.size(3) << "]" << std::endl; std::cout << " ROIs: [" << rois.size(0) << ", " << rois.size(1) << "]" << std::endl; std::cout << " Pooled size: [" << pooled_height_ << ", " << pooled_width_ << "]" << std::endl; std::cout << " Spatial scale: " << spatial_scale_ << std::endl; // Calculate output shape int channels = feat.size(1); int num_rois = rois.size(0); // Ensure both tensors are on CUDA if (!feat.is_cuda() || !rois.is_cuda()) { throw std::runtime_error("PrRoIPool2D requires CUDA tensors - CPU mode is not supported"); } // Print ROI values for debugging std::cout << " ROI values: " << std::endl; for (int i = 0; i < std::min(num_rois, 3); i++) { std::cout << " ROI " << i << ": ["; for (int j = 0; j < rois.size(1); j++) { std::cout << rois[i][j].item(); if (j < rois.size(1) - 1) std::cout << ", "; } std::cout << "]" << std::endl; } // Create output tensor on the same device auto output = torch::zeros({num_rois, channels, pooled_height_, pooled_width_}, feat.options()); // Copy tensors to CPU for the C implementation auto feat_cpu = feat.to(torch::kCPU).contiguous(); auto rois_cpu = rois.to(torch::kCPU).contiguous(); auto output_cpu = output.to(torch::kCPU).contiguous(); // Call the C wrapper function std::cout << " Calling prroi_pooling_forward_cuda..." << std::endl; prroi_pooling_forward_cuda( feat_cpu.data_ptr(), static_cast(rois_cpu.data_ptr()), static_cast(output_cpu.data_ptr()), channels, feat.size(2), feat.size(3), num_rois, pooled_height_, pooled_width_, spatial_scale_ ); std::cout << " prroi_pooling_forward_cuda completed" << std::endl; // Copy result back to GPU output.copy_(output_cpu); return output; } // LinearBlock implementation LinearBlock::LinearBlock(int in_planes, int out_planes, int input_sz, bool bias, bool batch_norm, bool relu) { // Create the linear layer with proper input dimensions auto linear_options = torch::nn::LinearOptions(in_planes * input_sz * input_sz, out_planes).bias(bias); linear = register_module("linear", torch::nn::Linear(linear_options)); use_bn = batch_norm; if (use_bn) { // Important: use BatchNorm2d to match Python implementation bn = register_module("bn", torch::nn::BatchNorm2d(torch::nn::BatchNorm2dOptions(out_planes))); } use_relu = relu; if (use_relu) { relu_ = register_module("relu", torch::nn::ReLU(torch::nn::ReLUOptions().inplace(true))); } } torch::Tensor LinearBlock::forward(torch::Tensor x) { // Store original dtype for later auto original_dtype = x.dtype(); // Use double precision for higher accuracy auto x_double = x.to(torch::kFloat64); // Reshape exactly as in Python: x.reshape(x.shape[0], -1) x_double = x_double.reshape({x_double.size(0), -1}).contiguous(); // Convert back to original precision for the linear operation auto x_float = x_double.to(original_dtype); x_float = linear->forward(x_float); // Back to double precision for further operations x_double = x_float.to(torch::kFloat64); if (use_bn) { // This is crucial: reshape to 4D tensor for BatchNorm2d exactly as in Python // In Python: x = self.bn(x.reshape(x.shape[0], x.shape[1], 1, 1)) x_double = x_double.reshape({x_double.size(0), x_double.size(1), 1, 1}).contiguous(); // Apply batch norm (convert to float32 for the operation) x_float = x_double.to(original_dtype); x_float = bn->forward(x_float); x_double = x_float.to(torch::kFloat64); } // Apply ReLU if needed if (use_relu) { // Apply ReLU in float32 precision x_float = x_double.to(original_dtype); x_float = relu_->forward(x_float); x_double = x_float.to(torch::kFloat64); } // Final reshape to 2D tensor, exactly matching Python's behavior x_double = x_double.reshape({x_double.size(0), -1}).contiguous(); // Return tensor in original precision return x_double.to(original_dtype); } // Create convolutional block torch::nn::Sequential BBRegressor::create_conv_block(int in_planes, int out_planes, int kernel_size, int stride, int padding, int dilation) { // Print dimensions for debugging std::cout << "Creating conv block: in_planes=" << in_planes << ", out_planes=" << out_planes << std::endl; torch::nn::Sequential seq; // Add convolutional layer seq->push_back(torch::nn::Conv2d(torch::nn::Conv2dOptions(in_planes, out_planes, kernel_size) .stride(stride).padding(padding).dilation(dilation).bias(true))); // Add batch normalization layer seq->push_back(torch::nn::BatchNorm2d(torch::nn::BatchNorm2dOptions(out_planes))); // Add ReLU activation seq->push_back(torch::nn::ReLU(torch::nn::ReLUOptions().inplace(true))); return seq; } // Helper function to verify BatchNorm dimensions void BBRegressor::verify_batchnorm_dimensions() { std::cout << "Verifying BatchNorm dimensions..." << std::endl; // Get children of conv3_1r std::cout << "conv3_1r has " << conv3_1r->size() << " modules" << std::endl; if (conv3_1r->size() > 1) { auto module = conv3_1r[1]; std::cout << "conv3_1r module[1] type: " << module->name() << std::endl; } // Get children of conv3_1t std::cout << "conv3_1t has " << conv3_1t->size() << " modules" << std::endl; if (conv3_1t->size() > 1) { auto module = conv3_1t[1]; std::cout << "conv3_1t module[1] type: " << module->name() << std::endl; } // Get children of conv3_2t std::cout << "conv3_2t has " << conv3_2t->size() << " modules" << std::endl; if (conv3_2t->size() > 1) { auto module = conv3_2t[1]; std::cout << "conv3_2t module[1] type: " << module->name() << std::endl; } } // Helper function to read file to bytes std::vector BBRegressor::read_file_to_bytes(const std::string& file_path) { std::ifstream file(file_path, std::ios::binary | std::ios::ate); if (!file.is_open()) { throw std::runtime_error("Could not open file: " + file_path); } std::streamsize size = file.tellg(); file.seekg(0, std::ios::beg); std::vector buffer(size); if (!file.read(buffer.data(), size)) { throw std::runtime_error("Could not read file: " + file_path); } return buffer; } // Load tensor from file torch::Tensor BBRegressor::load_tensor(const std::string& file_path) { try { // Read file into bytes first std::vector data = read_file_to_bytes(file_path); // Use pickle_load with byte data torch::Tensor tensor = torch::pickle_load(data).toTensor(); // Always move tensor to the specified device return tensor.to(device); } catch (const c10::Error& e) { std::cerr << "Error loading tensor from " << file_path << ": " << e.what() << std::endl; throw; } } // Constructor BBRegressor::BBRegressor(const std::string& base_dir, torch::Device dev) : device(dev), model_dir(base_dir + "/exported_weights/bb_regressor"), fc3_rt(256, 256, 5, true, true, true), fc4_rt(256, 256, 3, true, true, true) { // Check if base directory exists if (!fs::exists(base_dir)) { throw std::runtime_error("Base directory does not exist: " + base_dir); } // Check if model directory exists if (!fs::exists(model_dir)) { throw std::runtime_error("Model directory does not exist: " + model_dir); } // Initialize convolution blocks - match Python's AtomIoUNet implementation exactly std::cout << "Initializing conv blocks..." << std::endl; // In Python: self.conv3_1r = conv(input_dim[0], 128, kernel_size=3, stride=1) conv3_1r = create_conv_block(512, 128, 3, 1, 1, 1); // In Python: self.conv3_1t = conv(input_dim[0], 256, kernel_size=3, stride=1) conv3_1t = create_conv_block(512, 256, 3, 1, 1, 1); // In Python: self.conv3_2t = conv(256, pred_input_dim[0], kernel_size=3, stride=1) conv3_2t = create_conv_block(256, 256, 3, 1, 1, 1); // Update pooling sizes to match the Python model exactly // In Python: self.prroi_pool3r = PrRoIPool2D(3, 3, 1/8) prroi_pool3r = std::make_shared(3, 3, 0.125); // 1/8 scale for layer2 // In Python: self.prroi_pool3t = PrRoIPool2D(5, 5, 1/8) prroi_pool3t = std::make_shared(5, 5, 0.125); // 1/8 scale for layer2 // Create sequential blocks // In Python: self.fc3_1r = conv(128, 256, kernel_size=3, stride=1, padding=0) fc3_1r = create_conv_block(128, 256, 3, 1, 0, 1); // padding=0 for this layer // In Python: self.conv4_1r = conv(input_dim[1], 256, kernel_size=3, stride=1) conv4_1r = create_conv_block(1024, 256, 3, 1, 1, 1); // In Python: self.conv4_1t = conv(input_dim[1], 256, kernel_size=3, stride=1) conv4_1t = create_conv_block(1024, 256, 3, 1, 1, 1); // In Python: self.conv4_2t = conv(256, pred_input_dim[1], kernel_size=3, stride=1) conv4_2t = create_conv_block(256, 256, 3, 1, 1, 1); // In Python: self.prroi_pool4r = PrRoIPool2D(1, 1, 1/16) prroi_pool4r = std::make_shared(1, 1, 0.0625); // 1/16 scale for layer3 // In Python: self.prroi_pool4t = PrRoIPool2D(3, 3, 1/16) prroi_pool4t = std::make_shared(3, 3, 0.0625); // 1/16 scale for layer3 // In Python: self.fc34_3r = conv(256 + 256, pred_input_dim[0], kernel_size=1, stride=1, padding=0) fc34_3r = create_conv_block(512, 256, 1, 1, 0, 1); // kernel_size=1, padding=0 // In Python: self.fc34_4r = conv(256 + 256, pred_input_dim[1], kernel_size=1, stride=1, padding=0) fc34_4r = create_conv_block(512, 256, 1, 1, 0, 1); // kernel_size=1, padding=0 // Linear blocks - exactly match Python's implementation dimensions and parameters // In Python: self.fc3_rt = LinearBlock(pred_input_dim[0], pred_inter_dim[0], 5) fc3_rt = LinearBlock(256, 256, 5, true, true, true); // In Python: self.fc4_rt = LinearBlock(pred_input_dim[1], pred_inter_dim[1], 3) fc4_rt = LinearBlock(256, 256, 3, true, true, true); // In Python: self.iou_predictor = nn.Linear(pred_inter_dim[0]+pred_inter_dim[1], 1, bias=True) iou_predictor = torch::nn::Linear(torch::nn::LinearOptions(256 + 256, 1).bias(true)); // Load all weights load_weights(); // Set the model to evaluation mode this->eval(); // Debug information std::cout << "BB Regressor initialized in evaluation mode" << std::endl; } // Set the model to evaluation mode void BBRegressor::eval() { // Set all sequential modules to eval mode conv3_1r->eval(); conv3_1t->eval(); conv3_2t->eval(); fc3_1r->eval(); conv4_1r->eval(); conv4_1t->eval(); conv4_2t->eval(); fc34_3r->eval(); fc34_4r->eval(); // Linear blocks also need to be set to eval mode for BatchNorm layers fc3_rt.eval(); fc4_rt.eval(); // Set linear layers to eval mode (though this usually doesn't have any effect) iou_predictor->eval(); } // Load weights void BBRegressor::load_weights() { // Helper lambda to load weights for a sequential module auto load_sequential_weights = [this](torch::nn::Sequential& seq, const std::string& prefix) { try { // Load weights for conv layer (index 0) std::string weight_path = model_dir + "/" + prefix + "_0_weight.pt"; std::string bias_path = model_dir + "/" + prefix + "_0_bias.pt"; if (fs::exists(weight_path) && fs::exists(bias_path)) { auto conv_weight = load_tensor(weight_path); auto conv_bias = load_tensor(bias_path); // Get the conv2d module from sequential // Fix: Get the number of output channels from the weight tensor int out_channels = conv_weight.size(0); int in_channels = conv_weight.size(1); int kernel_size = conv_weight.size(2); std::cout << "Loading " << prefix << " conv weights: " << "[out_ch=" << out_channels << ", in_ch=" << in_channels << ", kernel=" << kernel_size << "]" << std::endl; // FIXED: Use the correct padding based on the layer name int padding = 1; // Default padding // Special cases for layers with different padding if (prefix == "fc3_1r" || prefix == "fc34_3r" || prefix == "fc34_4r") { padding = 0; // These layers use padding=0 in the Python implementation } std::cout << " Using padding=" << padding << " for " << prefix << std::endl; auto conv_options = torch::nn::Conv2dOptions(in_channels, out_channels, kernel_size) .stride(1).padding(padding).bias(true); auto conv_module = torch::nn::Conv2d(conv_options); // Set weights and bias conv_module->weight = conv_weight; conv_module->bias = conv_bias; // Debug info - print some weight stats std::cout << " Conv weight stats: mean=" << conv_weight.mean().item() << ", std=" << conv_weight.std().item() << ", min=" << conv_weight.min().item() << ", max=" << conv_weight.max().item() << std::endl; // Create a new sequence with the proper conv module auto new_seq = torch::nn::Sequential(); new_seq->push_back(conv_module); // Load batch norm parameters (index 1) std::string bn_weight_path = model_dir + "/" + prefix + "_1_weight.pt"; std::string bn_bias_path = model_dir + "/" + prefix + "_1_bias.pt"; std::string bn_mean_path = model_dir + "/" + prefix + "_1_running_mean.pt"; std::string bn_var_path = model_dir + "/" + prefix + "_1_running_var.pt"; if (fs::exists(bn_weight_path) && fs::exists(bn_bias_path) && fs::exists(bn_mean_path) && fs::exists(bn_var_path)) { auto bn_weight = load_tensor(bn_weight_path); auto bn_bias = load_tensor(bn_bias_path); auto bn_mean = load_tensor(bn_mean_path); auto bn_var = load_tensor(bn_var_path); // Important: Create BatchNorm with the correct number of features from the weights int num_features = bn_weight.size(0); std::cout << " Creating BatchNorm2d with num_features=" << num_features << std::endl; // Create a proper batch norm module with the right number of features auto bn_options = torch::nn::BatchNorm2dOptions(num_features) .eps(1e-5) // Match Python default .momentum(0.1) // Match Python default .affine(true) .track_running_stats(true); auto bn_module = torch::nn::BatchNorm2d(bn_options); // Set batch norm parameters bn_module->weight = bn_weight; bn_module->bias = bn_bias; bn_module->running_mean = bn_mean; bn_module->running_var = bn_var; // Debug info - print some batch norm stats std::cout << " BN weight stats: mean=" << bn_weight.mean().item() << ", std=" << bn_weight.std().item() << std::endl; std::cout << " BN running_mean stats: mean=" << bn_mean.mean().item() << ", std=" << bn_mean.std().item() << std::endl; std::cout << " BN running_var stats: mean=" << bn_var.mean().item() << ", std=" << bn_var.std().item() << std::endl; // Add the batch norm module to the sequence new_seq->push_back(bn_module); } // Add the ReLU module with inplace=true to match Python auto relu_options = torch::nn::ReLUOptions().inplace(true); new_seq->push_back(torch::nn::ReLU(relu_options)); // Replace the old sequence with the new one seq = new_seq; std::cout << "Loaded weights for " << prefix << std::endl; } else { std::cerr << "Weight files not found for " << prefix << std::endl; } } catch (const std::exception& e) { std::cerr << "Error loading weights for " << prefix << ": " << e.what() << std::endl; throw; // Re-throw to stop execution } }; // Load weights for linear blocks auto load_linear_block_weights = [this](LinearBlock& block, const std::string& prefix) { try { // Load weights for linear layer std::string weight_path = model_dir + "/" + prefix + "_linear_weight.pt"; std::string bias_path = model_dir + "/" + prefix + "_linear_bias.pt"; if (fs::exists(weight_path) && fs::exists(bias_path)) { auto linear_weight = load_tensor(weight_path); auto linear_bias = load_tensor(bias_path); // Set weights and bias block.linear->weight = linear_weight; block.linear->bias = linear_bias; // Load batch norm parameters std::string bn_weight_path = model_dir + "/" + prefix + "_bn_weight.pt"; std::string bn_bias_path = model_dir + "/" + prefix + "_bn_bias.pt"; std::string bn_mean_path = model_dir + "/" + prefix + "_bn_running_mean.pt"; std::string bn_var_path = model_dir + "/" + prefix + "_bn_running_var.pt"; if (fs::exists(bn_weight_path) && fs::exists(bn_bias_path) && fs::exists(bn_mean_path) && fs::exists(bn_var_path)) { auto bn_weight = load_tensor(bn_weight_path); auto bn_bias = load_tensor(bn_bias_path); auto bn_mean = load_tensor(bn_mean_path); auto bn_var = load_tensor(bn_var_path); // Set batch norm parameters block.bn->weight = bn_weight; block.bn->bias = bn_bias; block.bn->running_mean = bn_mean; block.bn->running_var = bn_var; } std::cout << "Loaded weights for " << prefix << std::endl; } else { std::cerr << "Weight files not found for " << prefix << std::endl; } } catch (const std::exception& e) { std::cerr << "Error loading weights for " << prefix << ": " << e.what() << std::endl; throw; // Re-throw to stop execution } }; // Load weights for all layers load_sequential_weights(conv3_1r, "conv3_1r"); load_sequential_weights(conv3_1t, "conv3_1t"); load_sequential_weights(conv3_2t, "conv3_2t"); load_sequential_weights(fc3_1r, "fc3_1r"); load_sequential_weights(conv4_1r, "conv4_1r"); load_sequential_weights(conv4_1t, "conv4_1t"); load_sequential_weights(conv4_2t, "conv4_2t"); load_sequential_weights(fc34_3r, "fc34_3r"); load_sequential_weights(fc34_4r, "fc34_4r"); load_linear_block_weights(fc3_rt, "fc3_rt"); load_linear_block_weights(fc4_rt, "fc4_rt"); // Load IoU predictor weights try { std::string weight_path = model_dir + "/iou_predictor_weight.pt"; std::string bias_path = model_dir + "/iou_predictor_bias.pt"; if (fs::exists(weight_path) && fs::exists(bias_path)) { auto weight = load_tensor(weight_path); auto bias = load_tensor(bias_path); iou_predictor->weight = weight; iou_predictor->bias = bias; std::cout << "Loaded weights for iou_predictor" << std::endl; } else { std::cerr << "Weight files not found for iou_predictor" << std::endl; } } catch (const std::exception& e) { std::cerr << "Error loading weights for iou_predictor: " << e.what() << std::endl; throw; // Re-throw to stop execution } } // Move model to device void BBRegressor::to(torch::Device device) { // Verify the device is a CUDA device if (!device.is_cuda()) { throw std::runtime_error("BBRegressor requires a CUDA device"); } this->device = device; // Move all components to device conv3_1r->to(device); conv3_1t->to(device); conv3_2t->to(device); fc3_1r->to(device); conv4_1r->to(device); conv4_1t->to(device); conv4_2t->to(device); fc3_rt.to(device); fc4_rt.to(device); iou_predictor->to(device); } // Get IoU features from backbone features std::vector BBRegressor::get_iou_feat(std::vector feat2) { // Convert to double precision for better numerical stability auto feat2_double0 = feat2[0].to(torch::kFloat64); auto feat2_double1 = feat2[1].to(torch::kFloat64); // Reshape exactly as in Python implementation // In Python: feat2 = [f.reshape(-1, *f.shape[-3:]) if f.dim()==5 else f for f in feat2] if (feat2_double0.dim() == 5) { auto shape = feat2_double0.sizes(); feat2_double0 = feat2_double0.reshape({-1, shape[2], shape[3], shape[4]}).contiguous(); } if (feat2_double1.dim() == 5) { auto shape = feat2_double1.sizes(); feat2_double1 = feat2_double1.reshape({-1, shape[2], shape[3], shape[4]}).contiguous(); } // Convert back to float32 for convolution operations feat2[0] = feat2_double0.to(torch::kFloat32).contiguous(); feat2[1] = feat2_double1.to(torch::kFloat32).contiguous(); // Apply convolutions exactly as in Python torch::Tensor feat3_t = feat2[0]; torch::Tensor feat4_t = feat2[1]; // Ensure we're in evaluation mode torch::NoGradGuard no_grad; // Apply convolutions just like Python version torch::Tensor c3_t_1 = conv3_1t->forward(feat3_t); c3_t_1 = c3_t_1.contiguous(); torch::Tensor c3_t = conv3_2t->forward(c3_t_1); c3_t = c3_t.contiguous(); torch::Tensor c4_t_1 = conv4_1t->forward(feat4_t); c4_t_1 = c4_t_1.contiguous(); torch::Tensor c4_t = conv4_2t->forward(c4_t_1); c4_t = c4_t.contiguous(); // Return results return {c3_t, c4_t}; } // Get modulation vectors for the target std::vector BBRegressor::get_modulation(std::vector feat, torch::Tensor bb) { // Apply target branch to get modulation vectors std::cout << " get_modulation input bb: " << bb.sizes() << std::endl; // Convert bounding box from [x, y, w, h] to [batch_idx, x1, y1, x2, y2] format for ROI pooling auto roi = torch::zeros({bb.size(0), 5}, bb.options()); // Set batch index to 0 (first element) roi.index_put_({torch::indexing::Slice(), 0}, 0); // Copy x, y coordinates roi.index_put_({torch::indexing::Slice(), 1}, bb.index({torch::indexing::Slice(), 0})); roi.index_put_({torch::indexing::Slice(), 2}, bb.index({torch::indexing::Slice(), 1})); // Calculate x2, y2 from width and height auto x2 = bb.index({torch::indexing::Slice(), 0}) + bb.index({torch::indexing::Slice(), 2}); auto y2 = bb.index({torch::indexing::Slice(), 1}) + bb.index({torch::indexing::Slice(), 3}); roi.index_put_({torch::indexing::Slice(), 3}, x2); roi.index_put_({torch::indexing::Slice(), 4}, y2); std::cout << " Converted ROI: ["; for (int i = 0; i < roi.size(1); i++) { std::cout << roi[0][i].item(); if (i < roi.size(1) - 1) std::cout << ", "; } std::cout << "]" << std::endl; // Apply target branch to get modulation vectors auto feat1 = conv3_1t->forward(feat[0]); auto feat2 = conv3_2t->forward(feat1); // Apply target branch to get modulation vectors for second feature map auto feat3 = conv4_1t->forward(feat[1]); auto feat4 = conv4_2t->forward(feat3); // ROI pool the features - use the same ROI for both feature maps std::cout << " Applying ROI pooling to layer 3..." << std::endl; auto pooled_feat1 = prroi_pool3t->forward(feat2, roi); std::cout << " Applying ROI pooling to layer 4..." << std::endl; auto pooled_feat2 = prroi_pool4t->forward(feat4, roi); // Flatten and concatenate the pooled features auto vec1 = pooled_feat1.reshape({pooled_feat1.size(0), -1}); auto vec2 = pooled_feat2.reshape({pooled_feat2.size(0), -1}); // Apply fully connected layer to get modulation vectors auto modulation1 = fc3_rt.forward(vec1); auto modulation2 = fc4_rt.forward(vec2); // Return modulation vectors return {modulation1, modulation2}; } // Predict IoU for proposals torch::Tensor BBRegressor::predict_iou(std::vector modulation, std::vector feat, torch::Tensor proposals) { // Debug dimensions std::cout << "Input dimensions:" << std::endl; std::cout << " modulation[0]: [" << modulation[0].size(0) << ", " << modulation[0].size(1) << "]" << std::endl; std::cout << " modulation[1]: [" << modulation[1].size(0) << ", " << modulation[1].size(1) << "]" << std::endl; std::cout << " feat[0]: [" << feat[0].size(0) << ", " << feat[0].size(1) << ", " << feat[0].size(2) << ", " << feat[0].size(3) << "]" << std::endl; std::cout << " feat[1]: [" << feat[1].size(0) << ", " << feat[1].size(1) << ", " << feat[1].size(2) << ", " << feat[1].size(3) << "]" << std::endl; std::cout << " proposals: [" << proposals.size(0) << ", " << proposals.size(1) << ", " << proposals.size(2) << "]" << std::endl; // Convert proposals from [batch, num_proposals, 4] to [num_proposals, 5] format // with batch index as the first element auto batch_size = proposals.size(0); auto num_proposals = proposals.size(1); // Reshape proposals to [num_proposals, 4] auto proposals_view = proposals.reshape({-1, 4}); // Create batch indices tensor [0, 0, 0, ...] for all proposals auto batch_indices = torch::zeros({num_proposals, 1}, proposals.options()); // Convert proposals from [x, y, w, h] to [batch_idx, x1, y1, x2, y2] format auto roi = torch::zeros({num_proposals, 5}, proposals.options()); roi.index_put_({torch::indexing::Slice(), 0}, batch_indices.squeeze()); roi.index_put_({torch::indexing::Slice(), 1}, proposals_view.index({torch::indexing::Slice(), 0})); roi.index_put_({torch::indexing::Slice(), 2}, proposals_view.index({torch::indexing::Slice(), 1})); // Calculate x2, y2 from width and height auto x2 = proposals_view.index({torch::indexing::Slice(), 0}) + proposals_view.index({torch::indexing::Slice(), 2}); auto y2 = proposals_view.index({torch::indexing::Slice(), 1}) + proposals_view.index({torch::indexing::Slice(), 3}); roi.index_put_({torch::indexing::Slice(), 3}, x2); roi.index_put_({torch::indexing::Slice(), 4}, y2); // Make sure ROI is on the same device as features torch::Device feat_device = feat[0].device(); roi = roi.to(feat_device); // Apply ROI pooling to get features for each proposal auto pooled_feat1 = prroi_pool3r->forward(feat[0], roi); auto pooled_feat2 = prroi_pool4r->forward(feat[1], roi); // Make sure all tensors are on the same device (GPU) torch::Device target_device = modulation[0].device(); pooled_feat1 = pooled_feat1.to(target_device); pooled_feat2 = pooled_feat2.to(target_device); // Print intermediate tensor shapes std::cout << " Pooled shapes:" << std::endl; std::cout << " pooled_feat1: [" << pooled_feat1.size(0) << ", " << pooled_feat1.size(1) << ", " << pooled_feat1.size(2) << ", " << pooled_feat1.size(3) << "]" << std::endl; std::cout << " pooled_feat2: [" << pooled_feat2.size(0) << ", " << pooled_feat2.size(1) << ", " << pooled_feat2.size(2) << ", " << pooled_feat2.size(3) << "]" << std::endl; // Inspect the IoU predictor dimensions std::cout << " IoU predictor dimensions:" << std::endl; std::cout << " weight: [" << iou_predictor->weight.size(0) << ", " << iou_predictor->weight.size(1) << "]" << std::endl; std::cout << " bias: [" << iou_predictor->bias.size(0) << "]" << std::endl; try { // Flatten pooled features auto vec1 = pooled_feat1.reshape({pooled_feat1.size(0), -1}); auto vec2 = pooled_feat2.reshape({pooled_feat2.size(0), -1}); // Print flattened shapes std::cout << " Flattened shapes:" << std::endl; std::cout << " vec1: [" << vec1.size(0) << ", " << vec1.size(1) << "]" << std::endl; std::cout << " vec2: [" << vec2.size(0) << ", " << vec2.size(1) << "]" << std::endl; // We need to adapt the input to match what the IoU predictor expects // The IoU predictor has a weight matrix of size 512x1, so input should have 512 features // Instead of concatenating the full features, we need to first reduce them to match expected size // This is based on the original Python implementation // Get modulation shapes std::cout << " Modulation vector shapes:" << std::endl; std::cout << " mod1: [" << modulation[0].size(0) << ", " << modulation[0].size(1) << "]" << std::endl; std::cout << " mod2: [" << modulation[1].size(0) << ", " << modulation[1].size(1) << "]" << std::endl; // Calculate expected dimensions int mod1_dim = modulation[0].size(1); // Should be 256 int mod2_dim = modulation[1].size(1); // Should be 256 int total_mod_dim = mod1_dim + mod2_dim; // Should be 512, matching iou_predictor weight row count std::cout << " Using correct input dimensions for IoU predictor (total_dim=" << total_mod_dim << ")" << std::endl; // Create processed features with correct dimensions auto processed_feat1 = torch::zeros({num_proposals, mod1_dim}, vec1.options()); auto processed_feat2 = torch::zeros({num_proposals, mod2_dim}, vec2.options()); // We need to reduce the dimensionality of vec1 and vec2 to match mod1_dim and mod2_dim // We'll use average pooling across spatial dimensions if (vec1.size(1) > mod1_dim) { // Average every N values to reduce dimension int pool_size = vec1.size(1) / mod1_dim; std::cout << " Reducing vec1 features with pool_size=" << pool_size << std::endl; for (int i = 0; i < num_proposals; i++) { for (int j = 0; j < mod1_dim; j++) { float sum = 0.0f; for (int k = 0; k < pool_size; k++) { int idx = j * pool_size + k; if (idx < vec1.size(1)) { sum += vec1[i][idx].item(); } } processed_feat1[i][j] = sum / pool_size; } } } else { // Just copy directly if dimensions already match processed_feat1 = vec1; } if (vec2.size(1) > mod2_dim) { // Similar reduction for vec2 int pool_size = vec2.size(1) / mod2_dim; std::cout << " Reducing vec2 features with pool_size=" << pool_size << std::endl; for (int i = 0; i < num_proposals; i++) { for (int j = 0; j < mod2_dim; j++) { float sum = 0.0f; for (int k = 0; k < pool_size; k++) { int idx = j * pool_size + k; if (idx < vec2.size(1)) { sum += vec2[i][idx].item(); } } processed_feat2[i][j] = sum / pool_size; } } } else { // Just copy directly if dimensions already match processed_feat2 = vec2; } // Prepare modulation vectors for each proposal auto mod1 = modulation[0].repeat({num_proposals, 1}); auto mod2 = modulation[1].repeat({num_proposals, 1}); std::cout << " Final feature shapes:" << std::endl; std::cout << " processed_feat1: [" << processed_feat1.size(0) << ", " << processed_feat1.size(1) << "]" << std::endl; std::cout << " processed_feat2: [" << processed_feat2.size(0) << ", " << processed_feat2.size(1) << "]" << std::endl; std::cout << " mod1: [" << mod1.size(0) << ", " << mod1.size(1) << "]" << std::endl; std::cout << " mod2: [" << mod2.size(0) << ", " << mod2.size(1) << "]" << std::endl; // Element-wise multiply features with modulation vectors auto mod_feat1 = processed_feat1 * mod1; auto mod_feat2 = processed_feat2 * mod2; // Concatenate to get final features for IoU prediction auto ioufeat = torch::cat({mod_feat1, mod_feat2}, /*dim=*/1); std::cout << " ioufeat shape: [" << ioufeat.size(0) << ", " << ioufeat.size(1) << "]" << std::endl; // Apply IoU predictor std::cout << " Applying IoU predictor" << std::endl; auto iou_scores = iou_predictor->forward(ioufeat); std::cout << " iou_scores raw shape: [" << iou_scores.size(0) << ", " << iou_scores.size(1) << "]" << std::endl; // Reshape back to [batch_size, num_proposals] iou_scores = iou_scores.reshape({batch_size, num_proposals}); std::cout << " Final iou_scores shape: [" << iou_scores.size(0) << ", " << iou_scores.size(1) << "]" << std::endl; return iou_scores; } catch (const std::exception& e) { std::cerr << "Error in predict_iou: " << e.what() << std::endl; // Create a fallback that won't crash, but report the error clearly std::cout << "CRITICAL ERROR: IoU prediction failed, returning constant scores" << std::endl; auto options = torch::TensorOptions().dtype(proposals.dtype()).device(proposals.device()); auto iou_scores = torch::ones({batch_size, num_proposals}, options) * 0.5; return iou_scores; } } // Print model information void BBRegressor::print_model_info() { std::cout << "BBRegressor Model Information:" << std::endl; std::cout << " - Model directory: " << model_dir << std::endl; std::cout << " - Device: CUDA:" << device.index() << std::endl; std::cout << " - CUDA Device Count: " << torch::cuda::device_count() << std::endl; std::cout << " - Using PreciseRoIPooling: " << #ifdef WITH_PRROI_POOLING "Yes" #else "No (will fail)" #endif << std::endl; } // Compute statistics for a tensor BBRegressor::TensorStats BBRegressor::compute_stats(const torch::Tensor& tensor) { TensorStats stats; // Get shape for (int i = 0; i < tensor.dim(); i++) { stats.shape.push_back(tensor.size(i)); } // Compute basic stats - make sure we reduce to scalar values stats.mean = tensor.mean().item(); // Mean of all elements stats.std_dev = tensor.std().item(); // Std dev of all elements stats.min_val = tensor.min().item(); // Min of all elements stats.max_val = tensor.max().item(); // Max of all elements stats.sum = tensor.sum().item(); // Sum of all elements // Sample values at specific positions if (tensor.dim() >= 4) { // For 4D tensors (batch, channel, height, width) stats.samples.push_back(tensor.index({0, 0, 0, 0}).item()); if (tensor.size(1) > 1 && tensor.size(2) > 1 && tensor.size(3) > 1) { int mid_c = static_cast(tensor.size(1) / 2); int mid_h = static_cast(tensor.size(2) / 2); int mid_w = static_cast(tensor.size(3) / 2); stats.samples.push_back(tensor.index({0, mid_c, mid_h, mid_w}).item()); // Use static_cast to convert int64_t to int to avoid type mismatch int64_t last_c_idx = tensor.size(1) - 1; int64_t last_h_idx = tensor.size(2) - 1; int64_t last_w_idx = tensor.size(3) - 1; // Limit indices to avoid accessing out of bounds if (last_c_idx > 10) last_c_idx = 10; if (last_h_idx > 10) last_h_idx = 10; if (last_w_idx > 10) last_w_idx = 10; stats.samples.push_back(tensor.index({0, static_cast(last_c_idx), static_cast(last_h_idx), static_cast(last_w_idx)}).item()); } } else if (tensor.dim() == 3) { // For 3D tensors stats.samples.push_back(tensor.index({0, 0, 0}).item()); if (tensor.size(1) > 1 && tensor.size(2) > 1) { int mid_h = static_cast(tensor.size(1) / 2); int mid_w = static_cast(tensor.size(2) / 2); stats.samples.push_back(tensor.index({0, mid_h, mid_w}).item()); int last_h = static_cast(tensor.size(1) - 1); int last_w = static_cast(tensor.size(2) - 1); stats.samples.push_back(tensor.index({0, last_h, last_w}).item()); } } else if (tensor.dim() == 2) { // For 2D tensors stats.samples.push_back(tensor.index({0, 0}).item()); if (tensor.size(0) > 1 && tensor.size(1) > 1) { int mid_h = static_cast(tensor.size(0) / 2); int mid_w = static_cast(tensor.size(1) / 2); stats.samples.push_back(tensor.index({mid_h, mid_w}).item()); int last_h = static_cast(tensor.size(0) - 1); int last_w = static_cast(tensor.size(1) - 1); stats.samples.push_back(tensor.index({last_h, last_w}).item()); } } else { // For 1D tensors or scalars if (tensor.numel() > 0) { stats.samples.push_back(tensor.index({0}).item()); if (tensor.size(0) > 1) { int mid = static_cast(tensor.size(0) / 2); stats.samples.push_back(tensor.index({mid}).item()); int last = static_cast(tensor.size(0) - 1); stats.samples.push_back(tensor.index({last}).item()); } } } return stats; } // Save tensor statistics to a file void BBRegressor::save_stats(const std::vector& all_stats, const std::string& filepath) { std::ofstream file(filepath); if (!file.is_open()) { std::cerr << "Error opening file for writing: " << filepath << std::endl; return; } for (size_t i = 0; i < all_stats.size(); i++) { const auto& stats = all_stats[i]; file << "Output " << i << ":" << std::endl; file << " Shape: ["; for (size_t j = 0; j < stats.shape.size(); j++) { file << stats.shape[j]; if (j < stats.shape.size() - 1) file << ", "; } file << "]" << std::endl; file << " Mean: " << stats.mean << std::endl; file << " Std: " << stats.std_dev << std::endl; file << " Min: " << stats.min_val << std::endl; file << " Max: " << stats.max_val << std::endl; file << " Sum: " << stats.sum << std::endl; file << " Sample values: ["; for (size_t j = 0; j < stats.samples.size(); j++) { file << stats.samples[j]; if (j < stats.samples.size() - 1) file << ", "; } file << "]" << std::endl << std::endl; } file.close(); }