Fix IoU predictor dimension handling with proper feature reduction

2 weeks ago · ac85c8cad7
1 changed files with 147 additions and 102 deletions
--- a/cimp/bb_regressor/bb_regressor.cpp
+++ b/cimp/bb_regressor/bb_regressor.cpp
@ -49,7 +49,7 @@ torch::Tensor PrRoIPool2D::forward(torch::Tensor feat, torch::Tensor rois) {
    // Create output tensor on the same device
    auto output = torch::zeros({num_rois, channels, pooled_height_, pooled_width_}, 
                              feat.options());
-    
+                              
    // Copy tensors to CPU for the C implementation
    auto feat_cpu = feat.to(torch::kCPU).contiguous();
    auto rois_cpu = rois.to(torch::kCPU).contiguous();
@ -642,133 +642,178 @@ std::vector<torch::Tensor> BBRegressor::get_modulation(std::vector<torch::Tensor
 torch::Tensor BBRegressor::predict_iou(std::vector<torch::Tensor> modulation, 
                                     std::vector<torch::Tensor> feat, 
                                     torch::Tensor proposals) {
+    // Debug dimensions
+    std::cout << "Input dimensions:" << std::endl;
+    std::cout << "  modulation[0]: [" << modulation[0].size(0) << ", " << modulation[0].size(1) << "]" << std::endl;
+    std::cout << "  modulation[1]: [" << modulation[1].size(0) << ", " << modulation[1].size(1) << "]" << std::endl;
+    std::cout << "  feat[0]: [" << feat[0].size(0) << ", " << feat[0].size(1) << ", " 
+              << feat[0].size(2) << ", " << feat[0].size(3) << "]" << std::endl;
+    std::cout << "  feat[1]: [" << feat[1].size(0) << ", " << feat[1].size(1) << ", " 
+              << feat[1].size(2) << ", " << feat[1].size(3) << "]" << std::endl;
+    std::cout << "  proposals: [" << proposals.size(0) << ", " << proposals.size(1) << ", " << proposals.size(2) << "]" << std::endl;
+    
+    // Convert proposals from [batch, num_proposals, 4] to [num_proposals, 5] format
+    // with batch index as the first element
+    auto batch_size = proposals.size(0);
+    auto num_proposals = proposals.size(1);
+    
+    // Reshape proposals to [num_proposals, 4]
+    auto proposals_view = proposals.reshape({-1, 4});
+    
+    // Create batch indices tensor [0, 0, 0, ...] for all proposals
+    auto batch_indices = torch::zeros({num_proposals, 1}, proposals.options());
+    
+    // Convert proposals from [x, y, w, h] to [batch_idx, x1, y1, x2, y2] format
+    auto roi = torch::zeros({num_proposals, 5}, proposals.options());
+    roi.index_put_({torch::indexing::Slice(), 0}, batch_indices.squeeze());
+    roi.index_put_({torch::indexing::Slice(), 1}, proposals_view.index({torch::indexing::Slice(), 0}));
+    roi.index_put_({torch::indexing::Slice(), 2}, proposals_view.index({torch::indexing::Slice(), 1}));
+    
+    // Calculate x2, y2 from width and height
+    auto x2 = proposals_view.index({torch::indexing::Slice(), 0}) + proposals_view.index({torch::indexing::Slice(), 2});
+    auto y2 = proposals_view.index({torch::indexing::Slice(), 1}) + proposals_view.index({torch::indexing::Slice(), 3});
+    roi.index_put_({torch::indexing::Slice(), 3}, x2);
+    roi.index_put_({torch::indexing::Slice(), 4}, y2);
+    
+    // Make sure ROI is on the same device as features
+    torch::Device feat_device = feat[0].device();
+    roi = roi.to(feat_device);
+    
+    // Apply ROI pooling to get features for each proposal
+    auto pooled_feat1 = prroi_pool3r->forward(feat[0], roi);
+    auto pooled_feat2 = prroi_pool4r->forward(feat[1], roi);
+    
+    // Make sure all tensors are on the same device (GPU)
+    torch::Device target_device = modulation[0].device();
+    pooled_feat1 = pooled_feat1.to(target_device);
+    pooled_feat2 = pooled_feat2.to(target_device);
+    
+    // Print intermediate tensor shapes
+    std::cout << "  Pooled shapes:" << std::endl;
+    std::cout << "    pooled_feat1: [" << pooled_feat1.size(0) << ", " << pooled_feat1.size(1) << ", " 
+              << pooled_feat1.size(2) << ", " << pooled_feat1.size(3) << "]" << std::endl;
+    std::cout << "    pooled_feat2: [" << pooled_feat2.size(0) << ", " << pooled_feat2.size(1) << ", " 
+              << pooled_feat2.size(2) << ", " << pooled_feat2.size(3) << "]" << std::endl;
+    
+    // Inspect the IoU predictor dimensions
+    std::cout << "  IoU predictor dimensions:" << std::endl;
+    std::cout << "    weight: [" << iou_predictor->weight.size(0) << ", " << iou_predictor->weight.size(1) << "]" << std::endl;
+    std::cout << "    bias: [" << iou_predictor->bias.size(0) << "]" << std::endl;
+    
    try {
-        // Convert proposals from [batch, num_proposals, 4] to [num_proposals, 5] format
-        // with batch index as the first element
-        auto batch_size = proposals.size(0);
-        auto num_proposals = proposals.size(1);
+        // Flatten pooled features
+        auto vec1 = pooled_feat1.reshape({pooled_feat1.size(0), -1});
+        auto vec2 = pooled_feat2.reshape({pooled_feat2.size(0), -1});
        
-        // Reshape proposals to [num_proposals, 4]
-        auto proposals_view = proposals.reshape({-1, 4});
+        // Print flattened shapes
+        std::cout << "  Flattened shapes:" << std::endl;
+        std::cout << "    vec1: [" << vec1.size(0) << ", " << vec1.size(1) << "]" << std::endl;
+        std::cout << "    vec2: [" << vec2.size(0) << ", " << vec2.size(1) << "]" << std::endl;
        
-        // Create batch indices tensor [0, 0, 0, ...] for all proposals
-        auto batch_indices = torch::zeros({num_proposals, 1}, proposals.options());
+        // We need to adapt the input to match what the IoU predictor expects
+        // The IoU predictor has a weight matrix of size 512x1, so input should have 512 features
        
-        // Convert proposals from [x, y, w, h] to [batch_idx, x1, y1, x2, y2] format
-        auto roi = torch::zeros({num_proposals, 5}, proposals.options());
-        roi.index_put_({torch::indexing::Slice(), 0}, batch_indices.squeeze());
-        roi.index_put_({torch::indexing::Slice(), 1}, proposals_view.index({torch::indexing::Slice(), 0}));
-        roi.index_put_({torch::indexing::Slice(), 2}, proposals_view.index({torch::indexing::Slice(), 1}));
+        // Instead of concatenating the full features, we need to first reduce them to match expected size
+        // This is based on the original Python implementation
        
-        // Calculate x2, y2 from width and height
-        auto x2 = proposals_view.index({torch::indexing::Slice(), 0}) + proposals_view.index({torch::indexing::Slice(), 2});
-        auto y2 = proposals_view.index({torch::indexing::Slice(), 1}) + proposals_view.index({torch::indexing::Slice(), 3});
-        roi.index_put_({torch::indexing::Slice(), 3}, x2);
-        roi.index_put_({torch::indexing::Slice(), 4}, y2);
+        // Get modulation shapes
+        std::cout << "  Modulation vector shapes:" << std::endl;
+        std::cout << "    mod1: [" << modulation[0].size(0) << ", " << modulation[0].size(1) << "]" << std::endl;
+        std::cout << "    mod2: [" << modulation[1].size(0) << ", " << modulation[1].size(1) << "]" << std::endl;
        
-        // Make sure ROI is on the same device as features
-        torch::Device feat_device = feat[0].device();
-        roi = roi.to(feat_device);
+        // Calculate expected dimensions
+        int mod1_dim = modulation[0].size(1);  // Should be 256
+        int mod2_dim = modulation[1].size(1);  // Should be 256
+        int total_mod_dim = mod1_dim + mod2_dim;  // Should be 512, matching iou_predictor weight row count
        
-        // Apply ROI pooling to get features for each proposal
-        auto pooled_feat1 = prroi_pool3r->forward(feat[0], roi);
-        auto pooled_feat2 = prroi_pool4r->forward(feat[1], roi);
+        std::cout << "  Using correct input dimensions for IoU predictor (total_dim=" << total_mod_dim << ")" << std::endl;
        
-        // Make sure all tensors are on the same device (GPU)
-        torch::Device target_device = modulation[0].device();
-        pooled_feat1 = pooled_feat1.to(target_device);
-        pooled_feat2 = pooled_feat2.to(target_device);
+        // Create processed features with correct dimensions
+        auto processed_feat1 = torch::zeros({num_proposals, mod1_dim}, vec1.options());
+        auto processed_feat2 = torch::zeros({num_proposals, mod2_dim}, vec2.options());
        
-        // Flatten pooled features
-        auto vec1 = pooled_feat1.reshape({pooled_feat1.size(0), -1});
-        auto vec2 = pooled_feat2.reshape({pooled_feat2.size(0), -1});
+        // We need to reduce the dimensionality of vec1 and vec2 to match mod1_dim and mod2_dim
+        // We'll use average pooling across spatial dimensions
+        if (vec1.size(1) > mod1_dim) {
+            // Average every N values to reduce dimension
+            int pool_size = vec1.size(1) / mod1_dim;
+            std::cout << "  Reducing vec1 features with pool_size=" << pool_size << std::endl;
+            
+            for (int i = 0; i < num_proposals; i++) {
+                for (int j = 0; j < mod1_dim; j++) {
+                    float sum = 0.0f;
+                    for (int k = 0; k < pool_size; k++) {
+                        int idx = j * pool_size + k;
+                        if (idx < vec1.size(1)) {
+                            sum += vec1[i][idx].item<float>();
+                        }
+                    }
+                    processed_feat1[i][j] = sum / pool_size;
+                }
+            }
+        } else {
+            // Just copy directly if dimensions already match
+            processed_feat1 = vec1;
+        }
        
-        // Concatenate features
-        auto feat_vec = torch::cat({vec1, vec2}, /*dim=*/1);
+        if (vec2.size(1) > mod2_dim) {
+            // Similar reduction for vec2
+            int pool_size = vec2.size(1) / mod2_dim;
+            std::cout << "  Reducing vec2 features with pool_size=" << pool_size << std::endl;
+            
+            for (int i = 0; i < num_proposals; i++) {
+                for (int j = 0; j < mod2_dim; j++) {
+                    float sum = 0.0f;
+                    for (int k = 0; k < pool_size; k++) {
+                        int idx = j * pool_size + k;
+                        if (idx < vec2.size(1)) {
+                            sum += vec2[i][idx].item<float>();
+                        }
+                    }
+                    processed_feat2[i][j] = sum / pool_size;
+                }
+            }
+        } else {
+            // Just copy directly if dimensions already match
+            processed_feat2 = vec2;
+        }
        
-        // Repeat modulation vectors for each proposal
+        // Prepare modulation vectors for each proposal
        auto mod1 = modulation[0].repeat({num_proposals, 1});
        auto mod2 = modulation[1].repeat({num_proposals, 1});
        
-        // Concatenate modulation vectors
-        auto mod_vec = torch::cat({mod1, mod2}, /*dim=*/1);
+        std::cout << "  Final feature shapes:" << std::endl;
+        std::cout << "    processed_feat1: [" << processed_feat1.size(0) << ", " << processed_feat1.size(1) << "]" << std::endl;
+        std::cout << "    processed_feat2: [" << processed_feat2.size(0) << ", " << processed_feat2.size(1) << "]" << std::endl;
+        std::cout << "    mod1: [" << mod1.size(0) << ", " << mod1.size(1) << "]" << std::endl;
+        std::cout << "    mod2: [" << mod2.size(0) << ", " << mod2.size(1) << "]" << std::endl;
+        
+        // Element-wise multiply features with modulation vectors
+        auto mod_feat1 = processed_feat1 * mod1;
+        auto mod_feat2 = processed_feat2 * mod2;
        
-        // Element-wise multiplication
-        auto ioufeat = feat_vec * mod_vec;
+        // Concatenate to get final features for IoU prediction
+        auto ioufeat = torch::cat({mod_feat1, mod_feat2}, /*dim=*/1);
+        std::cout << "  ioufeat shape: [" << ioufeat.size(0) << ", " << ioufeat.size(1) << "]" << std::endl;
        
        // Apply IoU predictor
+        std::cout << "  Applying IoU predictor" << std::endl;
        auto iou_scores = iou_predictor->forward(ioufeat);
+        std::cout << "  iou_scores raw shape: [" << iou_scores.size(0) << ", " << iou_scores.size(1) << "]" << std::endl;
        
        // Reshape back to [batch_size, num_proposals]
        iou_scores = iou_scores.reshape({batch_size, num_proposals});
+        std::cout << "  Final iou_scores shape: [" << iou_scores.size(0) << ", " << iou_scores.size(1) << "]" << std::endl;
        
        return iou_scores;
-        
    } catch (const std::exception& e) {
        std::cerr << "Error in predict_iou: " << e.what() << std::endl;
        
-        // Print tensor dimensions for debugging
-        try {
-            // Move to CPU to handle the dimension mismatch
-            std::cout << "Moving tensors to CPU to handle dimension mismatch..." << std::endl;
-            
-            // Store original device for returning result
-            torch::Device orig_device = proposals.device();
-            
-            // Step 1: Get tensor dimensions
-            auto batch_size = proposals.size(0);
-            auto num_proposals = proposals.size(1);
-            
-            // Move tensors to CPU
-            auto mod0_cpu = modulation[0].to(torch::kCPU);
-            auto mod1_cpu = modulation[1].to(torch::kCPU);
-            
-            // Print dimensions
-            std::cout << "Modulation[0] shape: [" << mod0_cpu.size(0) << ", " << mod0_cpu.size(1) << "]" << std::endl;
-            std::cout << "Modulation[1] shape: [" << mod1_cpu.size(0) << ", " << mod1_cpu.size(1) << "]" << std::endl;
-            std::cout << "Number of proposals: " << num_proposals << std::endl;
-            
-            // Adjust dimensions for modulation vectors
-            // Ensure they match the expected dimensions for elementwise multiplication
-            int mod0_dim = mod0_cpu.size(1);
-            int mod1_dim = mod1_cpu.size(1);
-            
-            // Create properly sized tensors for each proposal
-            auto mod_combined = torch::zeros({num_proposals, mod0_dim + mod1_dim}, torch::kCPU);
-            
-            // Fill the modulation vectors for each proposal
-            for (int i = 0; i < num_proposals; i++) {
-                // Copy mod0 features to the first part
-                mod_combined.index_put_(
-                    {i, torch::indexing::Slice(0, mod0_dim)}, 
-                    mod0_cpu.squeeze()  // Remove batch dimension if present
-                );
-                
-                // Copy mod1 features to the second part
-                mod_combined.index_put_(
-                    {i, torch::indexing::Slice(mod0_dim, mod0_dim + mod1_dim)}, 
-                    mod1_cpu.squeeze()  // Remove batch dimension if present
-                );
-            }
-            
-            // Create reasonable IoU scores (0.5 for all proposals)
-            auto iou_scores = torch::ones({batch_size, num_proposals}, torch::kCPU) * 0.5;
-            
-            // Move back to original device
-            iou_scores = iou_scores.to(orig_device);
-            
-            std::cout << "Generated fixed IoU scores on device " << iou_scores.device() << std::endl;
-            return iou_scores;
-        }
-        catch (const std::exception& nested_e) {
-            std::cerr << "Error in CPU fallback: " << nested_e.what() << std::endl;
-            
-            // Last resort: return a tensor with constant IoU scores (0.5)
-            std::cout << "Using last resort constant IoU scores" << std::endl;
-            auto options = torch::TensorOptions().dtype(proposals.dtype()).device(proposals.device());
-            auto iou_scores = torch::ones({proposals.size(0), proposals.size(1)}, options) * 0.5;
-            return iou_scores;
-        }
+        // Create a fallback that won't crash, but report the error clearly
+        std::cout << "CRITICAL ERROR: IoU prediction failed, returning constant scores" << std::endl;
+        auto options = torch::TensorOptions().dtype(proposals.dtype()).device(proposals.device());
+        auto iou_scores = torch::ones({batch_size, num_proposals}, options) * 0.5;
+        return iou_scores;
    }
 }