From 8ed90f273edb284a759f8afc194aecb8a28d9bdb Mon Sep 17 00:00:00 2001
From: mht <tohidihassan9@gmail.com>
Date: Sun, 1 Jun 2025 14:01:48 +0330
Subject: [PATCH] Fix BBRegressor: Apply modulation before PrRoIPool in
 predict_iou

---
 cimp/bb_regressor/bb_regressor.cpp | 126 +++++++++++++++++------------
 1 file changed, 73 insertions(+), 53 deletions(-)

diff --git a/cimp/bb_regressor/bb_regressor.cpp b/cimp/bb_regressor/bb_regressor.cpp
index b48c49c..121ca79 100644
--- a/cimp/bb_regressor/bb_regressor.cpp
+++ b/cimp/bb_regressor/bb_regressor.cpp
@@ -647,19 +647,56 @@ torch::Tensor BBRegressor::predict_iou(std::vector<torch::Tensor> modulation,
     auto feat_device = feat[0].device();
     roi = roi.to(feat_device);
     
-    // Apply ROI pooling to get features for each proposal
-    auto pooled_feat1 = prroi_pool3t->forward(feat[0], roi); // Output: [batch_size * num_proposals, C, 5, 5]
-    auto pooled_feat2 = prroi_pool4t->forward(feat[1], roi); // Output: [batch_size * num_proposals, C, 3, 3]
+    // Apply modulation vectors BEFORE PrRoIPooling
+    auto mod0_4d = modulation[0].to(feat_device); 
+    auto mod1_4d = modulation[1].to(feat_device); 
+
+    if (mod0_4d.dim() == 2) {
+        mod0_4d = mod0_4d.reshape({mod0_4d.size(0), mod0_4d.size(1), 1, 1});
+    }
+    if (mod1_4d.dim() == 2) {
+        mod1_4d = mod1_4d.reshape({mod1_4d.size(0), mod1_4d.size(1), 1, 1});
+    }
     
-    std::cout << "  Pooled shapes:" << std::endl;
-    std::cout << "    pooled_feat1 (from prroi_pool3t on feat[0]): [" << pooled_feat1.sizes() << "] dev: " << pooled_feat1.device() << std::endl;
-    std::cout << "    pooled_feat2 (from prroi_pool4t on feat[1]): [" << pooled_feat2.sizes() << "] dev: " << pooled_feat2.device() << std::endl;
+    // Ensure modulation vectors are broadcastable with features
+    // Features (feat[0], feat[1]) are [batch_size, channels, H, W]
+    // Modulation (mod0_4d, mod1_4d) should be [batch_size, channels, 1, 1]
+    // Modulation is per-image, so it is applied once to the full feature maps here;
+    // every proposal is then pooled from the already-modulated maps (no per-proposal repeat).
+    
+    torch::Tensor modulated_feat0 = feat[0] * mod0_4d;
+    torch::Tensor modulated_feat1 = feat[1] * mod1_4d;
+
+    // Apply ROI pooling to get features for each proposal from MODULATED features
+    auto pooled_feat1 = prroi_pool3t->forward(modulated_feat0, roi); // Output: [batch_size * num_proposals, C, 5, 5]
+    auto pooled_feat2 = prroi_pool4t->forward(modulated_feat1, roi);
+    
+    std::cout << "  Modulated and Pooled shapes:" << std::endl;
+    std::cout << "    pooled_feat1 (from prroi_pool3t on modulated_feat0): [" << pooled_feat1.sizes() << "] dev: " << pooled_feat1.device() << std::endl;
+    std::cout << "    pooled_feat2 (from prroi_pool4t on modulated_feat1): [" << pooled_feat2.sizes() << "] dev: " << pooled_feat2.device() << std::endl;
     
     std::cout << "  IoU predictor dimensions:" << std::endl;
     std::cout << "    weight: [" << iou_predictor->weight.sizes() << "]" << std::endl;
     std::cout << "    bias: [" << iou_predictor->bias.sizes() << "]" << std::endl;
     
     try {
+        // Modulation was applied before pooling, so pooled_feat1 and pooled_feat2 replace the
+        // former feat_prod_0 and feat_prod_1 products as inputs to the fc blocks.
+        auto x0 = fc3_rt.forward(pooled_feat1);
+        auto x1 = fc4_rt.forward(pooled_feat2);
+        
+        auto ioufeat_final = torch::cat({x0, x1}, 1).contiguous();
+        
+        // Ensure iou_predictor is on the correct device
+        iou_predictor->to(target_device); 
+        
+        auto iou_scores = iou_predictor->forward(ioufeat_final);
+        
+        // Ensure iou_scores is on the correct device before returning
+        iou_scores = iou_scores.to(target_device);
+
+        // The following block for feat_prod_0 and feat_prod_1 is no longer needed as modulation is done pre-pool.
+        /*
         auto mod0_4d = modulation[0].to(target_device); 
         auto mod1_4d = modulation[1].to(target_device); 
 
@@ -682,59 +719,42 @@ torch::Tensor BBRegressor::predict_iou(std::vector<torch::Tensor> modulation,
         std::cout << "    mod1_4d: [" << mod1_4d.sizes() << "] dev: " << mod1_4d.device() << std::endl;
         
         auto feat_prod_0 = pooled_feat1 * mod0_4d; 
-        auto feat_prod_1 = pooled_feat2 * mod1_4d; 
-        std::cout << "  After element-wise product with modulation:\n    feat_prod_0 (pooled_feat1 * mod0_4d): [" << feat_prod_0.sizes() << "] dev: " << feat_prod_0.device() << "\n    feat_prod_1 (pooled_feat2 * mod1_4d): [" << feat_prod_1.sizes() << "] dev: " << feat_prod_1.device() << std::endl;
-        
-        std::cout << "  Applying fc3_rt to feat_prod_0..." << std::endl;
-        auto x0 = fc3_rt.forward(feat_prod_0); // Corrected: . instead of ->
-        std::cout << "  Applying fc4_rt to feat_prod_1..." << std::endl;
-        auto x1 = fc4_rt.forward(feat_prod_1); // Corrected: . instead of ->
-        std::cout << "  After fc_rt blocks:\n    x0 (fc3_rt output): [" << x0.sizes() << "] dev: " << x0.device() << "\n    x1 (fc4_rt output): [" << x1.sizes() << "] dev: " << x1.device() << std::endl;
+        auto feat_prod_1 = pooled_feat2 * mod1_4d;
 
-        auto ioufeat_final = torch::cat(std::vector<torch::Tensor>{x0, x1}, 1).contiguous(); // Corrected: std::vector wrapper
-        std::cout << "  Concatenated ioufeat_final: [" << ioufeat_final.sizes() << "] dev: " << ioufeat_final.device() << std::endl;
+        std::cout << "  Feature product shapes (pooled_feat * mod_vec):" << std::endl;
+        std::cout << "    feat_prod_0: [" << feat_prod_0.sizes() << "] dev: " << feat_prod_0.device() << std::endl;
+        std::cout << "    feat_prod_1: [" << feat_prod_1.sizes() << "] dev: " << feat_prod_1.device() << std::endl;
+        
+        // Forward through linear blocks
+        // Ensure fc3_rt and fc4_rt are on the correct device
+        fc3_rt.to(target_device); 
+        fc4_rt.to(target_device);
 
-        torch::Tensor iou_scores;
-        try {
-            std::cout << "  Applying final iou_predictor on GPU" << std::endl;
-            iou_predictor->to(target_device); 
-            iou_scores = iou_predictor->forward(ioufeat_final.to(target_device));
-            std::cout << "  Final iou_predictor on GPU successful. Output scores shape: [" << iou_scores.sizes() << "]" << std::endl;
-
-        } catch (const std::exception& cuda_error) {
-            std::cout << "  GPU iou_predictor->forward() failed: " << cuda_error.what() << std::endl;
-            std::cout << "  Falling back to CPU for final iou_predictor" << std::endl;
-            
-            auto ioufeat_final_cpu = ioufeat_final.to(torch::kCPU).contiguous();
-            auto weight_cpu = iou_predictor->weight.to(torch::kCPU).contiguous();
-            auto bias_cpu = torch::Tensor(); 
-            if (iou_predictor->bias.defined()) {
-                bias_cpu = iou_predictor->bias.to(torch::kCPU).contiguous();
-            }
+        auto x0 = fc3_rt.forward(feat_prod_0);
+        auto x1 = fc4_rt.forward(feat_prod_1);
+        
+        std::cout << "  fc_rt output shapes:" << std::endl;
+        std::cout << "    x0 (fc3_rt output): [" << x0.sizes() << "] dev: " << x0.device() << std::endl;
+        std::cout << "    x1 (fc4_rt output): [" << x1.sizes() << "] dev: " << x1.device() << std::endl;
 
-            std::cout << "    DEBUG CPU Fallback: ioufeat_final_cpu device: " << ioufeat_final_cpu.device() << std::endl;
-            std::cout << "    DEBUG CPU Fallback: weight_cpu device: " << weight_cpu.device() << std::endl;
-            if (bias_cpu.defined()) {
-                 std::cout << "    DEBUG CPU Fallback: bias_cpu device: " << bias_cpu.device() << std::endl;
-            } else {
-                 std::cout << "    DEBUG CPU Fallback: bias_cpu is undefined." << std::endl;
-            }
-            
-            try {
-                iou_scores = torch::nn::functional::linear(ioufeat_final_cpu, weight_cpu, bias_cpu);
-                std::cout << "  CPU fallback torch::nn::functional::linear() successful. Output device: " << iou_scores.device() << std::endl;
-            } catch (const std::exception& cpu_fwd_error) {
-                std::cerr << "ERROR during CPU torch::nn::functional::linear(): " << cpu_fwd_error.what() << std::endl;
-                iou_predictor->to(target_device); 
-                throw; 
-            }
-            
-            iou_predictor->to(target_device); 
+        auto ioufeat_final = torch::cat({x0, x1}, 1).contiguous();
+        
+        std::cout << "  ioufeat_final shape: [" << ioufeat_final.sizes() << "] dev: " << ioufeat_final.device() << std::endl;
+        
+        // Ensure iou_predictor is on the correct device
+        iou_predictor->to(target_device); 
+        
+        auto iou_scores = iou_predictor->forward(ioufeat_final);
+        
+        // Ensure iou_scores is on the correct device before returning
+        iou_scores = iou_scores.to(target_device);
+        */
+        // Defensive re-check only: iou_scores was already moved to target_device right after
+        // the forward pass above, so this branch is normally not taken.
+        if (iou_scores.device() != target_device) {
             iou_scores = iou_scores.to(target_device);
         }
         
-        std::cout << "  iou_scores raw shape: [" << iou_scores.size(0) << ", " << iou_scores.size(1) << "]" << std::endl;
-        
         iou_scores = iou_scores.reshape({batch_size, num_proposals});
         std::cout << "  Final iou_scores shape: [" << iou_scores.size(0) << ", " << iou_scores.size(1) << "]" << std::endl;