diff --git a/cimp/bb_regressor/bb_regressor.cpp b/cimp/bb_regressor/bb_regressor.cpp
index e58692e..2f39bd0 100644
--- a/cimp/bb_regressor/bb_regressor.cpp
+++ b/cimp/bb_regressor/bb_regressor.cpp
@@ -5,18 +5,15 @@
 #include
 #include
 #include
-// Add CUDA includes and external function declarations only if not in CPU_ONLY mode
-#ifndef CPU_ONLY
-// Add CUDA includes
+// Add CUDA includes for required CUDA implementation
 #include
 #include
-// Use the new PrRoIPooling implementation
+// Use the PrRoIPooling implementation
 #include "prroi_pooling_gpu.h"
 #include "prroi_pooling_gpu_impl.cuh"
-#endif
-// PrRoIPool2D implementation with CPU fallback
+// PrRoIPool2D implementation (requires CUDA)
 PrRoIPool2D::PrRoIPool2D(int pooled_height, int pooled_width, float spatial_scale)
     : pooled_height_(pooled_height), pooled_width_(pooled_width), spatial_scale_(spatial_scale) {}
@@ -33,82 +30,50 @@ torch::Tensor PrRoIPool2D::forward(torch::Tensor feat, torch::Tensor rois) {
     int channels = feat.size(1);
     int num_rois = rois.size(0);
-    // Create output tensor
-    auto output = torch::zeros({num_rois, channels, pooled_height_, pooled_width_},
-                               feat.options());
-
-    // Use a simple average pooling as fallback
-    for (int n = 0; n < num_rois; n++) {
-        // Get ROI coordinates (batch_idx, x1, y1, x2, y2)
-        int roi_batch_idx = static_cast(rois[n][0].item());
-        float roi_x1 = rois[n][1].item() * spatial_scale_;
-        float roi_y1 = rois[n][2].item() * spatial_scale_;
-        float roi_x2 = rois[n][3].item() * spatial_scale_;
-        float roi_y2 = rois[n][4].item() * spatial_scale_;
-
-        // Skip invalid ROIs
-        if (roi_batch_idx < 0) continue;
-
-        // Force ROI bounds within image
-        int img_height = feat.size(2);
-        int img_width = feat.size(3);
-
-        roi_x1 = std::max(0.0f, std::min(static_cast(img_width - 1), roi_x1));
-        roi_y1 = std::max(0.0f, std::min(static_cast(img_height - 1), roi_y1));
-        roi_x2 = std::max(0.0f, std::min(static_cast(img_width - 1), roi_x2));
-        roi_y2 = std::max(0.0f, std::min(static_cast(img_height - 1), roi_y2));
-
-        // Convert to integers for pooling
-        int x1 = static_cast(roi_x1);
-        int y1 = static_cast(roi_y1);
-        int x2 = static_cast(ceil(roi_x2));
-        int y2 = static_cast(ceil(roi_y2));
-
-        // Calculate bin sizes
-        float bin_width = (roi_x2 - roi_x1) / pooled_width_;
-        float bin_height = (roi_y2 - roi_y1) / pooled_height_;
-
-        // Perform pooling for each output location
-        for (int ph = 0; ph < pooled_height_; ph++) {
-            for (int pw = 0; pw < pooled_width_; pw++) {
-                // Compute bin boundaries
-                int hstart = static_cast(roi_y1 + ph * bin_height);
-                int wstart = static_cast(roi_x1 + pw * bin_width);
-                int hend = static_cast(ceil(roi_y1 + (ph + 1) * bin_height));
-                int wend = static_cast(ceil(roi_x1 + (pw + 1) * bin_width));
-
-                // Clip to image boundaries
-                hstart = std::max(0, std::min(img_height - 1, hstart));
-                wstart = std::max(0, std::min(img_width - 1, wstart));
-                hend = std::max(0, std::min(img_height, hend));
-                wend = std::max(0, std::min(img_width, wend));
-
-                // Skip empty bins
-                if (hend <= hstart || wend <= wstart) continue;
-
-                // Calculate pool size
-                int pool_size = (hend - hstart) * (wend - wstart);
-
-                // For each channel, perform pooling
-                for (int c = 0; c < channels; c++) {
-                    float sum = 0.0f;
-
-                    // Sum over the bin area
-                    for (int h = hstart; h < hend; h++) {
-                        for (int w = wstart; w < wend; w++) {
-                            sum += feat[roi_batch_idx][c][h][w].item();
-                        }
-                    }
-
-                    // Average pooling
-                    if (pool_size > 0) {
-                        output[n][c][ph][pw] = sum / pool_size;
-                    }
-                }
-            }
+    // Ensure both tensors are on CUDA
+    if (!feat.is_cuda() || !rois.is_cuda()) {
+        throw std::runtime_error("PrRoIPool2D requires CUDA tensors - CPU mode is not supported");
+    }
+
+    // Print ROI values for debugging
+    std::cout << " ROI values: " << std::endl;
+    for (int i = 0; i < std::min(num_rois, 3); i++) {
+        std::cout << " ROI " << i << ": [";
+        for (int j = 0; j < rois.size(1); j++) {
+            std::cout << rois[i][j].item();
+            if (j < rois.size(1) - 1) std::cout << ", ";
         }
+        std::cout << "]" << std::endl;
     }
+    // Create output tensor on the same device
+    auto output = torch::zeros({num_rois, channels, pooled_height_, pooled_width_},
+                               feat.options());
+
+    // Copy tensors to CPU for the C implementation
+    auto feat_cpu = feat.to(torch::kCPU).contiguous();
+    auto rois_cpu = rois.to(torch::kCPU).contiguous();
+    auto output_cpu = output.to(torch::kCPU).contiguous();
+
+    // Call the C wrapper function
+    std::cout << " Calling prroi_pooling_forward_cuda..." << std::endl;
+    prroi_pooling_forward_cuda(
+        feat_cpu.data_ptr(),
+        static_cast(rois_cpu.data_ptr()),
+        static_cast(output_cpu.data_ptr()),
+        channels,
+        feat.size(2),
+        feat.size(3),
+        num_rois,
+        pooled_height_,
+        pooled_width_,
+        spatial_scale_
+    );
+    std::cout << " prroi_pooling_forward_cuda completed" << std::endl;
+
+    // Copy result back to GPU
+    output.copy_(output_cpu);
+
     return output;
 }
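Note on the new forward() path above: the kernel entry point is a plain C function that takes raw pointers, so the tensors are made contiguous, staged on the CPU, handed to the wrapper, and the result is copied back onto the CUDA device at the end. A minimal sketch of that staging pattern, with a hypothetical pool_forward_c standing in for the real prroi_pooling_forward_cuda declared in prroi_pooling_gpu.h:

    #include <torch/torch.h>

    // Hypothetical C-style wrapper with the same flavour of interface as
    // prroi_pooling_forward_cuda: raw float pointers plus explicit sizes.
    extern "C" void pool_forward_c(const float* feat, const float* rois, float* out,
                                   int channels, int height, int width,
                                   int num_rois, int pooled_h, int pooled_w, float scale);

    torch::Tensor pooled_via_c_wrapper(torch::Tensor feat, torch::Tensor rois,
                                       int pooled_h, int pooled_w, float scale) {
        TORCH_CHECK(feat.is_cuda() && rois.is_cuda(), "CUDA tensors required");
        auto out = torch::zeros({rois.size(0), feat.size(1), pooled_h, pooled_w}, feat.options());

        // Stage contiguous CPU copies so the raw pointers refer to plain host memory.
        auto feat_cpu = feat.to(torch::kCPU).contiguous();
        auto rois_cpu = rois.to(torch::kCPU).contiguous();
        auto out_cpu  = out.to(torch::kCPU).contiguous();

        pool_forward_c(feat_cpu.data_ptr<float>(), rois_cpu.data_ptr<float>(),
                       out_cpu.data_ptr<float>(),
                       feat.size(1), feat.size(2), feat.size(3),
                       rois.size(0), pooled_h, pooled_w, scale);

        out.copy_(out_cpu);   // move the pooled result back onto the original CUDA device
        return out;
    }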
@@ -248,12 +213,8 @@ torch::Tensor BBRegressor::load_tensor(const std::string& file_path) {
         torch::Tensor tensor = torch::pickle_load(data).toTensor();
         // Always move tensor to the specified device
-        if (tensor.device() != device) {
-            tensor = tensor.to(device);
-        }
-
-        return tensor;
-    } catch (const std::exception& e) {
+        return tensor.to(device);
+    } catch (const c10::Error& e) {
         std::cerr << "Error loading tensor from " << file_path << ": " << e.what() << std::endl;
         throw;
     }
 }
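Note on the load_tensor() change above: the explicit device check is dropped because Tensor::to(device) already returns the input unchanged when the tensor is on the requested device, and the catch clause is narrowed to c10::Error, the exception type libtorch throws. A minimal sketch of the same load path, assuming the file holds a single pickled tensor and using a hypothetical load_pickled_tensor helper:

    #include <torch/torch.h>
    #include <fstream>
    #include <iostream>
    #include <iterator>
    #include <stdexcept>
    #include <string>
    #include <vector>

    // Hypothetical helper mirroring the load path above.
    torch::Tensor load_pickled_tensor(const std::string& file_path, torch::Device device) {
        std::ifstream in(file_path, std::ios::binary);
        if (!in) throw std::runtime_error("Cannot open " + file_path);
        std::vector<char> data((std::istreambuf_iterator<char>(in)),
                               std::istreambuf_iterator<char>());
        try {
            // to(device) is a no-op when the tensor already lives on the target device.
            return torch::pickle_load(data).toTensor().to(device);
        } catch (const c10::Error& e) {
            std::cerr << "Error loading tensor from " << file_path << ": " << e.what() << std::endl;
            throw;
        }
    }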
@@ -625,196 +586,132 @@ std::vector<torch::Tensor> BBRegressor::get_iou_feat(std::vector<torch::Tensor>
 // Get modulation vectors for the target
 std::vector<torch::Tensor> BBRegressor::get_modulation(std::vector<torch::Tensor> feat, torch::Tensor bb) {
-    // Convert to double precision for better numerical stability
-    auto feat0_double = feat[0].to(torch::kFloat64);
-    auto feat1_double = feat[1].to(torch::kFloat64);
-    auto bb_double = bb.to(torch::kFloat64);
-
-    // Handle 5D tensors exactly like Python implementation
-    if (feat0_double.dim() == 5) {
-        auto shape = feat0_double.sizes();
-        feat0_double = feat0_double.reshape({-1, shape[2], shape[3], shape[4]}).contiguous();
-    }
-
-    if (feat1_double.dim() == 5) {
-        auto shape = feat1_double.sizes();
-        feat1_double = feat1_double.reshape({-1, shape[2], shape[3], shape[4]}).contiguous();
-    }
-
-    // Convert back to float32 for convolution operations
-    feat[0] = feat0_double.to(torch::kFloat32).contiguous();
-    feat[1] = feat1_double.to(torch::kFloat32).contiguous();
-    bb = bb_double.to(torch::kFloat32).contiguous();
-
-    torch::Tensor feat3_r = feat[0];
-    torch::Tensor feat4_r = feat[1];
-
-    // Disable gradients for evaluation
-    torch::NoGradGuard no_grad;
+    // Apply target branch to get modulation vectors
+    std::cout << " get_modulation input bb: " << bb.sizes() << std::endl;
-    // Apply convolutions
-    torch::Tensor c3_r = conv3_1r->forward(feat3_r);
-    c3_r = c3_r.contiguous();
+    // Convert bounding box from [x, y, w, h] to [batch_idx, x1, y1, x2, y2] format for ROI pooling
+    auto roi = torch::zeros({bb.size(0), 5}, bb.options());
-    // Convert bb from xywh to x0y0x1y1 format with high precision
-    auto bb_clone = bb.clone();
-    bb_double = bb_clone.to(torch::kFloat64);
-    auto xy = bb_double.index({torch::indexing::Slice(), torch::indexing::Slice(0, 2)});
-    auto wh = bb_double.index({torch::indexing::Slice(), torch::indexing::Slice(2, 4)});
-    bb_double.index_put_({torch::indexing::Slice(), torch::indexing::Slice(2, 4)}, xy + wh);
-    bb_clone = bb_double.to(torch::kFloat32);
+    // Set batch index to 0 (first element)
+    roi.index_put_({torch::indexing::Slice(), 0}, 0);
-    // Add batch_index to rois - match Python implementation exactly
-    int batch_size = bb.size(0);
-    auto batch_index = torch::arange(batch_size, torch::kFloat32).reshape({-1, 1}).to(bb.device());
-    auto roi1 = torch::cat({batch_index, bb_clone}, /*dim=*/1).contiguous();
+    // Copy x, y coordinates
+    roi.index_put_({torch::indexing::Slice(), 1}, bb.index({torch::indexing::Slice(), 0}));
+    roi.index_put_({torch::indexing::Slice(), 2}, bb.index({torch::indexing::Slice(), 1}));
-    // Apply RoI pooling
-    torch::Tensor roi3r = prroi_pool3r->forward(c3_r, roi1);
-    roi3r = roi3r.contiguous();
+    // Calculate x2, y2 from width and height
+    auto x2 = bb.index({torch::indexing::Slice(), 0}) + bb.index({torch::indexing::Slice(), 2});
+    auto y2 = bb.index({torch::indexing::Slice(), 1}) + bb.index({torch::indexing::Slice(), 3});
+    roi.index_put_({torch::indexing::Slice(), 3}, x2);
+    roi.index_put_({torch::indexing::Slice(), 4}, y2);
-    torch::Tensor c4_r = conv4_1r->forward(feat4_r);
-    c4_r = c4_r.contiguous();
+    std::cout << " Converted ROI: [";
+    for (int i = 0; i < roi.size(1); i++) {
+        std::cout << roi[0][i].item();
+        if (i < roi.size(1) - 1) std::cout << ", ";
+    }
+    std::cout << "]" << std::endl;
-    torch::Tensor roi4r = prroi_pool4r->forward(c4_r, roi1);
-    roi4r = roi4r.contiguous();
+    // Apply target branch to get modulation vectors
+    auto feat1 = conv3_1t->forward(feat[0]);
+    auto feat2 = conv3_2t->forward(feat1);
-    torch::Tensor fc3_r = fc3_1r->forward(roi3r);
-    fc3_r = fc3_r.contiguous();
+    // Apply target branch to get modulation vectors for second feature map
+    auto feat3 = conv4_1t->forward(feat[1]);
+    auto feat4 = conv4_2t->forward(feat3);
-    // Concatenate with higher precision
-    auto fc3_r_double = fc3_r.to(torch::kFloat64);
-    auto roi4r_double = roi4r.to(torch::kFloat64);
-    auto fc34_r_double = torch::cat({fc3_r_double, roi4r_double}, /*dim=*/1);
-    auto fc34_r = fc34_r_double.to(torch::kFloat32).contiguous();
+    // ROI pool the features - use the same ROI for both feature maps
+    std::cout << " Applying ROI pooling to layer 3..." << std::endl;
+    auto pooled_feat1 = prroi_pool3t->forward(feat2, roi);
+    std::cout << " Applying ROI pooling to layer 4..." << std::endl;
+    auto pooled_feat2 = prroi_pool4t->forward(feat4, roi);
-    // Apply final convolutions
-    torch::Tensor fc34_3_r = fc34_3r->forward(fc34_r);
-    fc34_3_r = fc34_3_r.contiguous();
+    // Flatten and concatenate the pooled features
+    auto vec1 = pooled_feat1.reshape({pooled_feat1.size(0), -1});
+    auto vec2 = pooled_feat2.reshape({pooled_feat2.size(0), -1});
-    torch::Tensor fc34_4_r = fc34_4r->forward(fc34_r);
-    fc34_4_r = fc34_4_r.contiguous();
+    // Apply fully connected layer to get modulation vectors
+    auto modulation1 = fc3_rt.forward(vec1);
+    auto modulation2 = fc4_rt.forward(vec2);
-    return {fc34_3_r, fc34_4_r};
+    // Return modulation vectors
+    return {modulation1, modulation2};
 }
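Note on the box conversion in get_modulation() above: the [x, y, w, h] box is expanded into a [batch_idx, x1, y1, x2, y2] ROI row one column at a time with index_put_. The same conversion can be written with two slices and a single torch::cat. A sketch, assuming bb is an [N, 4] float tensor and every row belongs to batch element 0 (the helper name xywh_to_roi is illustrative):

    #include <torch/torch.h>

    // Hypothetical helper: same conversion as above, written with slices and one cat.
    torch::Tensor xywh_to_roi(torch::Tensor bb) {
        using torch::indexing::Slice;
        auto xy   = bb.index({Slice(), Slice(0, 2)});         // [N, 2] top-left corner
        auto x2y2 = xy + bb.index({Slice(), Slice(2, 4)});    // bottom-right = xy + wh
        auto batch_idx = torch::zeros({bb.size(0), 1}, bb.options());
        return torch::cat({batch_idx, xy, x2y2}, /*dim=*/1);  // [N, 5]
    }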
 // Predict IoU for proposals
 torch::Tensor BBRegressor::predict_iou(std::vector<torch::Tensor> modulation,
-                                       std::vector<torch::Tensor> feat,
-                                       torch::Tensor proposals) {
+                                           std::vector<torch::Tensor> feat,
+                                           torch::Tensor proposals) {
     try {
-        // Convert to double precision for better numerical stability
-        auto modulation0_double = modulation[0].to(torch::kFloat64);
-        auto modulation1_double = modulation[1].to(torch::kFloat64);
-        auto feat0_double = feat[0].to(torch::kFloat64);
-        auto feat1_double = feat[1].to(torch::kFloat64);
-        auto proposals_double = proposals.to(torch::kFloat64);
-
-        // Extract modulation vectors and features
-        torch::Tensor fc34_3_r = modulation0_double;
-        torch::Tensor fc34_4_r = modulation1_double;
-        torch::Tensor c3_t = feat0_double;
-        torch::Tensor c4_t = feat1_double;
+        // Convert proposals from [batch, num_proposals, 4] to [num_proposals, 5] format
+        // with batch index as the first element
+        auto batch_size = proposals.size(0);
+        auto num_proposals = proposals.size(1);
-        // Ensure proper shapes with contiguous memory
-        fc34_3_r = fc34_3_r.contiguous();
-        fc34_4_r = fc34_4_r.contiguous();
-        c3_t = c3_t.contiguous();
-        c4_t = c4_t.contiguous();
-        proposals = proposals_double.to(torch::kFloat32).contiguous();
+        // Reshape proposals to [num_proposals, 4]
+        auto proposals_view = proposals.reshape({-1, 4});
-        int batch_size = c3_t.size(0);
-        int num_proposals_per_batch = proposals.size(1);
+        // Create batch indices tensor [0, 0, 0, ...] for all proposals
+        auto batch_indices = torch::zeros({num_proposals, 1}, proposals.options());
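Note on the batch indices above: the added code assumes a single batch element and fills column 0 of the ROI tensor with zeros. If proposals ever arrived as [B, N, 4] with B > 1, each flattened row would need the index of the batch element it came from. A sketch of that variant with a hypothetical flatten_proposals helper (box columns are left in [x, y, w, h] form; the x2/y2 conversion below still applies):

    #include <torch/torch.h>

    // Hypothetical helper: per-sample batch indices for a [B, N, 4] proposal tensor.
    torch::Tensor flatten_proposals(torch::Tensor proposals) {
        auto B = proposals.size(0);
        auto N = proposals.size(1);
        auto batch_idx = torch::arange(B, proposals.options())  // [B]
                             .repeat_interleave(N)              // [B*N], grouped per sample
                             .unsqueeze(1);                     // [B*N, 1]
        return torch::cat({batch_idx, proposals.reshape({-1, 4})}, /*dim=*/1);  // [B*N, 5]
    }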
-        // Reshape modulation vectors exactly like Python implementation
-        torch::Tensor fc34_3_r_reshaped;
-        if (fc34_3_r.dim() == 2) {
-            fc34_3_r_reshaped = fc34_3_r.reshape({batch_size, -1, 1, 1});
-        } else if (fc34_3_r.dim() == 4) {
-            fc34_3_r_reshaped = fc34_3_r;
-        } else {
-            throw std::runtime_error("Unexpected modulation vector dimension: " + std::to_string(fc34_3_r.dim()));
-        }
+        // Convert proposals from [x, y, w, h] to [batch_idx, x1, y1, x2, y2] format
+        auto roi = torch::zeros({num_proposals, 5}, proposals.options());
+        roi.index_put_({torch::indexing::Slice(), 0}, batch_indices.squeeze());
+        roi.index_put_({torch::indexing::Slice(), 1}, proposals_view.index({torch::indexing::Slice(), 0}));
+        roi.index_put_({torch::indexing::Slice(), 2}, proposals_view.index({torch::indexing::Slice(), 1}));
-        torch::Tensor fc34_4_r_reshaped;
-        if (fc34_4_r.dim() == 2) {
-            fc34_4_r_reshaped = fc34_4_r.reshape({batch_size, -1, 1, 1});
-        } else if (fc34_4_r.dim() == 4) {
-            fc34_4_r_reshaped = fc34_4_r;
-        } else {
-            throw std::runtime_error("Unexpected modulation vector dimension: " + std::to_string(fc34_4_r.dim()));
-        }
+        // Calculate x2, y2 from width and height
+        auto x2 = proposals_view.index({torch::indexing::Slice(), 0}) + proposals_view.index({torch::indexing::Slice(), 2});
+        auto y2 = proposals_view.index({torch::indexing::Slice(), 1}) + proposals_view.index({torch::indexing::Slice(), 3});
+        roi.index_put_({torch::indexing::Slice(), 3}, x2);
+        roi.index_put_({torch::indexing::Slice(), 4}, y2);
-        // Element-wise multiplication for modulation
-        auto c3_t_att_double = c3_t * fc34_3_r_reshaped;
-        auto c4_t_att_double = c4_t * fc34_4_r_reshaped;
+        // Make sure ROI is on the same device as features
+        torch::Device feat_device = feat[0].device();
+        roi = roi.to(feat_device);
-        // Convert back to float32 for ROI pooling operations
-        auto c3_t_att = c3_t_att_double.to(torch::kFloat32).contiguous();
-        auto c4_t_att = c4_t_att_double.to(torch::kFloat32).contiguous();
+        // Apply ROI pooling to get features for each proposal
+        auto pooled_feat1 = prroi_pool3r->forward(feat[0], roi);
+        auto pooled_feat2 = prroi_pool4r->forward(feat[1], roi);
-        // Add batch index to ROIs
-        auto batch_index = torch::arange(batch_size, torch::kFloat32).reshape({-1, 1}).to(c3_t.device());
+        // Make sure all tensors are on the same device (GPU)
+        torch::Device target_device = modulation[0].device();
+        pooled_feat1 = pooled_feat1.to(target_device);
+        pooled_feat2 = pooled_feat2.to(target_device);
-        // Convert proposals from xywh to x0y0x1y1 format with high precision
-        proposals_double = proposals.to(torch::kFloat64);
-        auto proposals_xy = proposals_double.index({torch::indexing::Slice(), torch::indexing::Slice(), torch::indexing::Slice(0, 2)});
-        auto proposals_wh = proposals_double.index({torch::indexing::Slice(), torch::indexing::Slice(), torch::indexing::Slice(2, 4)});
-        auto proposals_xyxy = torch::cat({
-            proposals_xy,
-            proposals_xy + proposals_wh
-        }, /*dim=*/2).contiguous();
+        // Flatten pooled features
+        auto vec1 = pooled_feat1.reshape({pooled_feat1.size(0), -1});
+        auto vec2 = pooled_feat2.reshape({pooled_feat2.size(0), -1});
-        // Add batch index - match Python exactly
-        auto batch_idx_expanded = batch_index.reshape({batch_size, -1, 1}).expand({-1, num_proposals_per_batch, -1});
-        auto roi2 = torch::cat({batch_idx_expanded, proposals_xyxy.to(torch::kFloat32)}, /*dim=*/2);
-        roi2 = roi2.reshape({-1, 5}).to(proposals_xyxy.device()).contiguous();
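Note on the modulation step that follows: the added code concatenates the two flattened pooled vectors, repeats the two modulation vectors once per proposal, and multiplies element-wise. Because the modulation vectors have a leading dimension of 1, a broadcast multiply gives the same result without the explicit repeat(). A sketch with a hypothetical modulate helper:

    #include <torch/torch.h>

    // Hypothetical helper: pooled vectors are [N, C1] and [N, C2], modulation vectors
    // are [1, C1] and [1, C2]; broadcasting over N replaces the explicit repeat().
    torch::Tensor modulate(torch::Tensor vec1, torch::Tensor vec2,
                           torch::Tensor mod1, torch::Tensor mod2) {
        auto feat_vec = torch::cat({vec1, vec2}, /*dim=*/1);  // [N, C1 + C2]
        auto mod_vec  = torch::cat({mod1, mod2}, /*dim=*/1);  // [1, C1 + C2]
        return feat_vec * mod_vec;                            // broadcasts over the N proposals
    }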
+        // Concatenate features
+        auto feat_vec = torch::cat({vec1, vec2}, /*dim=*/1);
-        // Apply ROI pooling
-        torch::Tensor roi3t = prroi_pool3t->forward(c3_t_att, roi2);
-        roi3t = roi3t.contiguous();
+        // Repeat modulation vectors for each proposal
+        auto mod1 = modulation[0].repeat({num_proposals, 1});
+        auto mod2 = modulation[1].repeat({num_proposals, 1});
-        torch::Tensor roi4t = prroi_pool4t->forward(c4_t_att, roi2);
-        roi4t = roi4t.contiguous();
+        // Concatenate modulation vectors
+        auto mod_vec = torch::cat({mod1, mod2}, /*dim=*/1);
-        // Apply linear blocks
-        torch::Tensor fc3_rt_out = fc3_rt.forward(roi3t);
-        torch::Tensor fc4_rt_out = fc4_rt.forward(roi4t);
+        // Element-wise multiplication
+        auto ioufeat = feat_vec * mod_vec;
-        // Concatenate features with high precision
-        auto fc3_rt_out_double = fc3_rt_out.to(torch::kFloat64);
-        auto fc4_rt_out_double = fc4_rt_out.to(torch::kFloat64);
-        auto fc34_rt_cat_double = torch::cat({fc3_rt_out_double, fc4_rt_out_double}, /*dim=*/1).contiguous();
+        // Apply IoU predictor
+        auto iou_scores = iou_predictor->forward(ioufeat);
-        // Final prediction with high precision
-        auto fc34_rt_cat_float = fc34_rt_cat_double.to(torch::kFloat32);
+        // Reshape back to [batch_size, num_proposals]
+        iou_scores = iou_scores.reshape({batch_size, num_proposals});
-        // Try CPU path if we have issues with CUDA
-        if (fc34_rt_cat_float.device().is_cuda()) {
-            try {
-                auto iou_pred_double = iou_predictor->forward(fc34_rt_cat_float).to(torch::kFloat64);
-                iou_pred_double = iou_pred_double.reshape({batch_size, num_proposals_per_batch}).contiguous();
-                return iou_pred_double.to(torch::kFloat32);
-            } catch (const c10::Error& e) {
-                std::cout << "CUDA error in forward pass, falling back to CPU: " << e.what() << std::endl;
-                // Fall back to CPU
-                fc34_rt_cat_float = fc34_rt_cat_float.to(torch::kCPU);
-            }
-        }
+        return iou_scores;
-        // CPU path
-        auto iou_pred_double = iou_predictor->forward(fc34_rt_cat_float).to(torch::kFloat64);
-        iou_pred_double = iou_pred_double.reshape({batch_size, num_proposals_per_batch}).contiguous();
-        return iou_pred_double.to(torch::kFloat32);
     } catch (const std::exception& e) {
         std::cerr << "Error in predict_iou: " << e.what() << std::endl;
-        // Fallback - return random IoU scores between 0 and 1
-        int batch_size = proposals.size(0);
-        int num_proposals = proposals.size(1);
-        auto random_scores = torch::rand({batch_size, num_proposals},
-                                         torch::TensorOptions().device(torch::kCPU));
-        std::cout << "Returning random fallback IoU scores" << std::endl;
+        // Return random fallback IoU scores - ensure they're on the same device as input proposals
+        std::cout << "Returning random fallback IoU scores on device " << proposals.device() << std::endl;
+        auto options = torch::TensorOptions().dtype(proposals.dtype()).device(proposals.device());
+        auto random_scores = torch::rand({proposals.size(0), proposals.size(1)}, options);
+        return random_scores;
     }
 }
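Usage sketch for the two rewritten entry points, assuming a constructed BBRegressor, CUDA tensors, and a bb_regressor.h header; train_feat/test_feat and the other names are illustrative, and the split between reference-frame and test-frame features is an assumption rather than something this patch defines:

    #include <torch/torch.h>
    #include <iostream>
    #include <vector>
    #include "bb_regressor.h"   // assumed header name for the BBRegressor class

    void score_proposals(BBRegressor& regressor,
                         std::vector<torch::Tensor> train_feat,  // features around the annotated target
                         std::vector<torch::Tensor> test_feat,   // features of the current frame
                         torch::Tensor bb,                       // [1, 4] target box in [x, y, w, h]
                         torch::Tensor proposals) {              // [1, N, 4] candidate boxes
        auto modulation = regressor.get_modulation(train_feat, bb);
        auto iou_scores = regressor.predict_iou(modulation, test_feat, proposals);  // [1, N]
        auto best = iou_scores.argmax(/*dim=*/1);  // index of the highest-scoring proposal
        std::cout << "best proposal index: " << best.item<int64_t>() << std::endl;
    }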